[ { "title": "$p$-Laplacian Based Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16019", "id": "16019", "proceeding": "https://proceedings.mlr.press/v162/fu22e.html", "poster": "", "slides": "/media/icml-2022/Slides/16019_ouxQWes.pdf", "author_site": "Guoji Fu, Peilin Zhao, Yatao Bian", "author": "Guoji Fu; Peilin Zhao; Yatao Bian", "abstract": "Graph neural networks (GNNs) have demonstrated superior performance for semi-supervised node classification on graphs, as a result of their ability to exploit node features and topological information simultaneously. However, most GNNs implicitly assume that the labels of nodes and their neighbors in a graph are the same or consistent, which does not hold in heterophilic graphs, where the labels of linked nodes are likely to differ. Moreover, when the topology is non-informative for label prediction, ordinary GNNs may work significantly worse than simply applying multi-layer perceptrons (MLPs) on each node. To tackle the above problem, we propose a new $p$-Laplacian based GNN model, termed as $^p$GNN, whose message passing mechanism is derived from a discrete regularization framework and could be theoretically explained as an approximation of a polynomial graph filter defined on the spectral domain of $p$-Laplacians. The spectral analysis shows that the new message passing mechanism works as low-high-pass filters, thus making $^p$GNNs are effective on both homophilic and heterophilic graphs. Empirical studies on real-world and synthetic datasets validate our findings and demonstrate that $^p$GNNs significantly outperform several state-of-the-art GNN architectures on heterophilic benchmarks while achieving competitive performance on homophilic benchmarks. Moreover, $^p$GNNs can adaptively learn aggregation weights and are robust to noisy edges.", "bibtex": "@InProceedings{pmlr-v162-fu22e,\n title = \t {$p$-{L}aplacian Based Graph Neural Networks},\n author = {Fu, Guoji and Zhao, Peilin and Bian, Yatao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6878--6917},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fu22e/fu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/fu22e.html},\n abstract = \t {Graph neural networks (GNNs) have demonstrated superior performance for semi-supervised node classification on graphs, as a result of their ability to exploit node features and topological information simultaneously. However, most GNNs implicitly assume that the labels of nodes and their neighbors in a graph are the same or consistent, which does not hold in heterophilic graphs, where the labels of linked nodes are likely to differ. Moreover, when the topology is non-informative for label prediction, ordinary GNNs may work significantly worse than simply applying multi-layer perceptrons (MLPs) on each node. To tackle the above problem, we propose a new $p$-Laplacian based GNN model, termed as $^p$GNN, whose message passing mechanism is derived from a discrete regularization framework and could be theoretically explained as an approximation of a polynomial graph filter defined on the spectral domain of $p$-Laplacians. 
The spectral analysis shows that the new message passing mechanism works as low-high-pass filters, thus making $^p$GNNs effective on both homophilic and heterophilic graphs. Empirical studies on real-world and synthetic datasets validate our findings and demonstrate that $^p$GNNs significantly outperform several state-of-the-art GNN architectures on heterophilic benchmarks while achieving competitive performance on homophilic benchmarks. Moreover, $^p$GNNs can adaptively learn aggregation weights and are robust to noisy edges.}\n}", "pdf": "https://proceedings.mlr.press/v162/fu22e/fu22e.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/fu22e-supp.zip", "pdf_size": 3087296, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15123165040444629585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China", "aff_domain": "gmail.com; ;gmail.com", "email": "gmail.com; ;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/fu22e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "AI Lab", "aff_unique_url": "https://ai.tencent.com", "aff_unique_abbr": "Tencent AI Lab", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "(Non-)Convergence Results for Predictive Coding Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18155", "id": "18155", "proceeding": "https://proceedings.mlr.press/v162/frieder22a.html", "poster": "/media/PosterPDFs/ICML%202022/42d0c639a21482dcd8e1c447efe87e89_4ZBJE0c.png?t=1656507202.9186318", "slides": "", "author_site": "Simon Frieder, Thomas Lukasiewicz", "author": "Simon Frieder; Thomas Lukasiewicz", "abstract": "Predictive coding networks (PCNs) are (un)supervised learning models, coming from neuroscience, that approximate how the brain works. One major open problem around PCNs is their convergence behavior. In this paper, we use dynamical systems theory to formally investigate the convergence of PCNs as they are used in machine learning. Doing so, we put their theory on a firm, rigorous basis, by developing a precise mathematical framework for PCN and show that for sufficiently small weights and initializations, PCNs converge for any input. Thereby, we provide the theoretical assurance that previous implementations, whose convergence was assessed solely by numerical experiments, can indeed capture the correct behavior of PCNs. Outside of the identified regime of small weights and small initializations, we show via a counterexample that PCNs can diverge, countering common beliefs held in the community. 
This is achieved by identifying a Neimark-Sacker bifurcation in a PCN of small size, which gives rise to an unstable fixed point and an invariant curve around it.", "bibtex": "@InProceedings{pmlr-v162-frieder22a,\n title = \t {({N}on-){C}onvergence Results for Predictive Coding Networks},\n author = {Frieder, Simon and Lukasiewicz, Thomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6793--6810},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/frieder22a/frieder22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/frieder22a.html},\n abstract = \t {Predictive coding networks (PCNs) are (un)supervised learning models, coming from neuroscience, that approximate how the brain works. One major open problem around PCNs is their convergence behavior. In this paper, we use dynamical systems theory to formally investigate the convergence of PCNs as they are used in machine learning. Doing so, we put their theory on a firm, rigorous basis, by developing a precise mathematical framework for PCN and show that for sufficiently small weights and initializations, PCNs converge for any input. Thereby, we provide the theoretical assurance that previous implementations, whose convergence was assessed solely by numerical experiments, can indeed capture the correct behavior of PCNs. Outside of the identified regime of small weights and small initializations, we show via a counterexample that PCNs can diverge, countering common beliefs held in the community. 
This is achieved by identifying a Neimark-Sacker bifurcation in a PCN of small size, which gives rise to an unstable fixed point and an invariant curve around it.}\n}", "pdf": "https://proceedings.mlr.press/v162/frieder22a/frieder22a.pdf", "supp": "", "pdf_size": 660445, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1312279318779804765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, University of Oxford, UK; Institute of Logic and Computation, TU Wien, Austria", "aff_domain": "cs.ac.ox.uk; ", "email": "cs.ac.ox.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/frieder22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Oxford;TU Wien", "aff_unique_dep": "Department of Computer Science;Institute of Logic and Computation", "aff_unique_url": "https://www.ox.ac.uk;https://www.tuwien.ac.at", "aff_unique_abbr": "Oxford;TU Wien", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Austria" }, { "title": "3D Infomax improves GNNs for Molecular Property Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17331", "id": "17331", "proceeding": "https://proceedings.mlr.press/v162/stark22a.html", "poster": "/media/PosterPDFs/ICML%202022/780965ae22ea6aee11935f3fb73da841.png?t=1657197147.0899732", "slides": "", "author_site": "Hannes St\u00e4rk, Dominique Beaini, Gabriele Corso, Prudencio Tossou, Christian Dallago, Stephan G\u00fcnnemann, Pietro Li\u00f3", "author": "Hannes St\u00e4rk; Dominique Beaini; Gabriele Corso; Prudencio Tossou; Christian Dallago; Stephan G\u00fcnnemann; Pietro Li\u00f3", "abstract": "Molecular property prediction is one of the fastest-growing applications of deep learning with critical real-world impacts. Although the 3D molecular graph structure is necessary for models to achieve strong performance on many tasks, it is infeasible to obtain 3D structures at the scale required by many real-world applications. To tackle this issue, we propose to use existing 3D molecular datasets to pre-train a model to reason about the geometry of molecules given only their 2D molecular graphs. Our method, called 3D Infomax, maximizes the mutual information between learned 3D summary vectors and the representations of a graph neural network (GNN). During fine-tuning on molecules with unknown geometry, the GNN is still able to produce implicit 3D information and uses it for downstream tasks. We show that 3D Infomax provides significant improvements for a wide range of properties, including a 22% average MAE reduction on QM9 quantum mechanical properties. 
Moreover, the learned representations can be effectively transferred between datasets in different molecular spaces.", "bibtex": "@InProceedings{pmlr-v162-stark22a,\n title = \t {3{D} Infomax improves {GNN}s for Molecular Property Prediction},\n author = {St{\\\"a}rk, Hannes and Beaini, Dominique and Corso, Gabriele and Tossou, Prudencio and Dallago, Christian and G{\\\"u}nnemann, Stephan and Li{\\'o}, Pietro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20479--20502},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/stark22a/stark22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/stark22a.html},\n abstract = \t {Molecular property prediction is one of the fastest-growing applications of deep learning with critical real-world impacts. Although the 3D molecular graph structure is necessary for models to achieve strong performance on many tasks, it is infeasible to obtain 3D structures at the scale required by many real-world applications. To tackle this issue, we propose to use existing 3D molecular datasets to pre-train a model to reason about the geometry of molecules given only their 2D molecular graphs. Our method, called 3D Infomax, maximizes the mutual information between learned 3D summary vectors and the representations of a graph neural network (GNN). During fine-tuning on molecules with unknown geometry, the GNN is still able to produce implicit 3D information and uses it for downstream tasks. We show that 3D Infomax provides significant improvements for a wide range of properties, including a 22% average MAE reduction on QM9 quantum mechanical properties. 
Moreover, the learned representations can be effectively transferred between datasets in different molecular spaces.}\n}", "pdf": "https://proceedings.mlr.press/v162/stark22a/stark22a.pdf", "supp": "", "pdf_size": 1752113, "gs_citation": 294, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18195860750409632321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "EECS, Massachusetts Institute of Technology, Cambridge MA, USA+Valence Discovery, Montreal, CA; Valence Discovery, Montreal, CA; EECS, Massachusetts Institute of Technology, Cambridge MA, USA; Valence Discovery, Montreal, CA; Department of Informatics, Technical University of Munich, DE; Department of Informatics, Technical University of Munich, DE; Department of Computer Science and Technology, University of Cambridge, UK", "aff_domain": "mit.edu; ; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/stark22a.html", "aff_unique_index": "0+1;1;0;1;2;2;3", "aff_unique_norm": "Massachusetts Institute of Technology;Valence Discovery;Technical University of Munich;University of Cambridge", "aff_unique_dep": "EECS;;Department of Informatics;Department of Computer Science and Technology", "aff_unique_url": "https://web.mit.edu;;https://www.tum.de;https://www.cam.ac.uk", "aff_unique_abbr": "MIT;;TUM;Cambridge", "aff_campus_unique_index": "0+1;1;0;1;0", "aff_campus_unique": "Cambridge;Montreal;", "aff_country_unique_index": "0+1;1;0;1;2;2;3", "aff_country_unique": "United States;Canada;Germany;United Kingdom" }, { "title": "3DLinker: An E(3) Equivariant Variational Autoencoder for Molecular Linker Design", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18143", "id": "18143", "proceeding": "https://proceedings.mlr.press/v162/huang22g.html", "poster": "/media/PosterPDFs/ICML%202022/987b75e2727ae55289abd70d3f5864e6.png?t=1657998007.0431776", "slides": "/media/icml-2022/Slides/18143.pdf", "author_site": "Yinan Huang, Xingang Peng, Jianzhu Ma, Muhan Zhang", "author": "Yinan Huang; Xingang Peng; Jianzhu Ma; Muhan Zhang", "abstract": "Deep learning has achieved tremendous success in designing novel chemical compounds with desirable pharmaceutical properties. In this work, we focus on a new type of drug design problem \u2014 generating a small \u201clinker\u201d to physically attach two independent molecules with their distinct functions. The main computational challenges include: 1) the generation of linkers is conditional on the two given molecules, in contrast to generating complete molecules from scratch in previous works; 2) linkers heavily depend on the anchor atoms of the two molecules to be connected, which are not known beforehand; 3) 3D structures and orientations of the molecules need to be considered to avoid atom clashes, for which equivariance to E(3) group are necessary. To address these problems, we propose a conditional generative model, named 3DLinker, which is able to predict anchor atoms and jointly generate linker graphs and their 3D structures based on an E(3) equivariant graph variational autoencoder. So far as we know, no previous models could achieve this task. 
We compare our model with multiple conditional generative models modified from other molecular design tasks and find that our model has a significantly higher rate in recovering molecular graphs, and more importantly, accurately predicting the 3D coordinates of all the atoms.", "bibtex": "@InProceedings{pmlr-v162-huang22g,\n title = \t {3{DL}inker: An E(3) Equivariant Variational Autoencoder for Molecular Linker Design},\n author = {Huang, Yinan and Peng, Xingang and Ma, Jianzhu and Zhang, Muhan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9280--9294},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22g/huang22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22g.html},\n abstract = \t {Deep learning has achieved tremendous success in designing novel chemical compounds with desirable pharmaceutical properties. In this work, we focus on a new type of drug design problem \u2014 generating a small \u201clinker\u201d to physically attach two independent molecules with their distinct functions. The main computational challenges include: 1) the generation of linkers is conditional on the two given molecules, in contrast to generating complete molecules from scratch in previous works; 2) linkers heavily depend on the anchor atoms of the two molecules to be connected, which are not known beforehand; 3) 3D structures and orientations of the molecules need to be considered to avoid atom clashes, for which equivariance to E(3) group are necessary. To address these problems, we propose a conditional generative model, named 3DLinker, which is able to predict anchor atoms and jointly generate linker graphs and their 3D structures based on an E(3) equivariant graph variational autoencoder. So far as we know, no previous models could achieve this task. 
We compare our model with multiple conditional generative models modified from other molecular design tasks and find that our model has a significantly higher rate in recovering molecular graphs, and more importantly, accurately predicting the 3D coordinates of all the atoms.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22g/huang22g.pdf", "supp": "", "pdf_size": 11710424, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4167500711441098938&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Beijing Institute for General Artificial Intelligence; Tsinghua University; Institute for Artificial Intelligence, Peking University; Beijing Institute for General Artificial Intelligence + Institute for Artificial Intelligence, Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn; ; ", "email": "pku.edu.cn;pku.edu.cn; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/huang22g.html", "aff_unique_index": "0;1;2;0+2", "aff_unique_norm": "Beijing Institute for General Artificial Intelligence;Tsinghua University;Peking University", "aff_unique_dep": ";;Institute for Artificial Intelligence", "aff_unique_url": "http://www.bigaiai.org/;https://www.tsinghua.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "BIGAI;THU;PKU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "China" }, { "title": "3PC: Three Point Compressors for Communication-Efficient Distributed Training and a Better Theory for Lazy Aggregation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18065", "id": "18065", "proceeding": "https://proceedings.mlr.press/v162/richtarik22a.html", "poster": "/media/PosterPDFs/ICML%202022/b6417f112bd27848533e54885b66c288_r5C1iLn.png?t=1658349135.936732", "slides": "", "author_site": "Peter Richtarik, Igor Sokolov, Elnur Gasanov, Ilyas Fatkhullin, Zhize Li, Eduard Gorbunov", "author": "Peter Richtarik; Igor Sokolov; Elnur Gasanov; Ilyas Fatkhullin; Zhize Li; Eduard Gorbunov", "abstract": "We propose and study a new class of gradient compressors for communication-efficient training\u2014three point compressors (3PC)\u2014as well as efficient distributed nonconvex optimization algorithms that can take advantage of them. 
Unlike most established approaches, which rely on a static compressor choice (e.g., TopK), our class allows the compressors to", "bibtex": "@InProceedings{pmlr-v162-richtarik22a,\n title = \t {3{PC}: Three Point Compressors for Communication-Efficient Distributed Training and a Better Theory for Lazy Aggregation},\n author = {Richtarik, Peter and Sokolov, Igor and Gasanov, Elnur and Fatkhullin, Ilyas and Li, Zhize and Gorbunov, Eduard},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18596--18648},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/richtarik22a/richtarik22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/richtarik22a.html},\n abstract = \t {We propose and study a new class of gradient compressors for communication-efficient training\u2014three point compressors (3PC)\u2014as well as efficient distributed nonconvex optimization algorithms that can take advantage of them. Unlike most established approaches, which rely on a static compressor choice (e.g., TopK), our class allows the compressors to", "pdf": "https://proceedings.mlr.press/v162/richtarik22a/richtarik22a.pdf", "supp": "", "pdf_size": 5198722, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11457289168159820638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia; Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia; ETH AI Center, Switzerland; Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia + ETH Zurich, Switzerland; Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia; Moscow Institute of Physics and Technology, Dolgoprudny, Russia", "aff_domain": "kaust.edu.sa; ; ; ; ; ", "email": "kaust.edu.sa; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/richtarik22a.html", "aff_unique_index": "0;0;1;0+1;0;2", "aff_unique_norm": "King Abdullah University of Science and Technology;ETH Zurich;Moscow Institute of Physics and Technology", "aff_unique_dep": "Computer Science;ETH AI Center;", "aff_unique_url": "https://www.kast.kau.edu.sa;https://www.ethz.ch;https://www.mipt.ru", "aff_unique_abbr": "KAUST;ETH;MIPT", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Thuwal;;Dolgoprudny", "aff_country_unique_index": "0;0;1;0+1;0;2", "aff_country_unique": "Saudi Arabia;Switzerland;Russian Federation" }, { "title": "A Branch and Bound Framework for Stronger Adversarial Attacks of ReLU Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17077", "id": "17077", "proceeding": "https://proceedings.mlr.press/v162/zhang22ae.html", "poster": "/media/PosterPDFs/ICML%202022/77ef24b42fcecae4ba74fa68137f3e43.png?t=1658362912.7494664", "slides": "", "author_site": "Huan Zhang, Shiqi Wang, Kaidi Xu, Yihan Wang, Suman Jana, Cho-Jui Hsieh, Zico Kolter", "author": "Huan Zhang; Shiqi Wang; Kaidi Xu; Yihan Wang; Suman Jana; Cho-Jui Hsieh; Zico Kolter", "abstract": "Strong adversarial attacks are important for evaluating the true robustness of deep neural networks. 
Most existing attacks search in the input space, e.g., using gradient descent, and may miss adversarial examples due to non-convexity. In this work, we systematically search adversarial examples in the activation space of ReLU networks to tackle hard instances where none of the existing adversarial attacks succeed. Unfortunately, searching the activation space typically relies on generic mixed integer programming (MIP) solvers and is limited to small networks and easy problem instances. To improve scalability and practicability, we use branch and bound (BaB) with specialized GPU-based bound propagation methods, and propose a top-down beam-search approach to quickly identify the subspace that may contain adversarial examples. Moreover, we build an adversarial candidates pool using cheap attacks to further assist the search in activation space via diving techniques and a bottom-up large neighborhood search. Our adversarial attack framework, BaB-Attack, opens up a new opportunity for designing novel adversarial attacks not limited to searching the input space, and enables us to borrow techniques from integer programming theory and neural network verification. In experiments, we can successfully generate adversarial examples when existing attacks on input space fail. Compared to off-the-shelf MIP solver based attacks that requires significant computations, we outperform in both success rates and efficiency.", "bibtex": "@InProceedings{pmlr-v162-zhang22ae,\n title = \t {A Branch and Bound Framework for Stronger Adversarial Attacks of {R}e{LU} Networks},\n author = {Zhang, Huan and Wang, Shiqi and Xu, Kaidi and Wang, Yihan and Jana, Suman and Hsieh, Cho-Jui and Kolter, Zico},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26591--26604},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ae/zhang22ae.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ae.html},\n abstract = \t {Strong adversarial attacks are important for evaluating the true robustness of deep neural networks. Most existing attacks search in the input space, e.g., using gradient descent, and may miss adversarial examples due to non-convexity. In this work, we systematically search adversarial examples in the activation space of ReLU networks to tackle hard instances where none of the existing adversarial attacks succeed. Unfortunately, searching the activation space typically relies on generic mixed integer programming (MIP) solvers and is limited to small networks and easy problem instances. To improve scalability and practicability, we use branch and bound (BaB) with specialized GPU-based bound propagation methods, and propose a top-down beam-search approach to quickly identify the subspace that may contain adversarial examples. Moreover, we build an adversarial candidates pool using cheap attacks to further assist the search in activation space via diving techniques and a bottom-up large neighborhood search. Our adversarial attack framework, BaB-Attack, opens up a new opportunity for designing novel adversarial attacks not limited to searching the input space, and enables us to borrow techniques from integer programming theory and neural network verification. 
In experiments, we can successfully generate adversarial examples when existing attacks on input space fail. Compared to off-the-shelf MIP solver based attacks that requires significant computations, we outperform in both success rates and efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ae/zhang22ae.pdf", "supp": "", "pdf_size": 1068857, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2786522381978800098&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Carnegie Mellon University; Columbia University; Drexel University; UCLA; Columbia University; UCLA; Carnegie Mellon University", "aff_domain": "huan-zhang.com;columbia.edu; ; ; ; ; ", "email": "huan-zhang.com;columbia.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhang22ae.html", "aff_unique_index": "0;1;2;3;1;3;0", "aff_unique_norm": "Carnegie Mellon University;Columbia University;Drexel University;University of California, Los Angeles", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cmu.edu;https://www.columbia.edu;https://www.drexel.edu;https://www.ucla.edu", "aff_unique_abbr": "CMU;Columbia;Drexel;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Closer Look at Smoothness in Domain Adversarial Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17769", "id": "17769", "proceeding": "https://proceedings.mlr.press/v162/rangwani22a.html", "poster": "/media/PosterPDFs/ICML%202022/4ff6fa96179cdc2838e8d8ce64cd10a7.png?t=1657784515.917734", "slides": "", "author_site": "Harsh Rangwani, Sumukh K Aithal, Mayank Mishra, Arihant Jain, Venkatesh Babu Radhakrishnan", "author": "Harsh Rangwani; Sumukh K Aithal; Mayank Mishra; Arihant Jain; Venkatesh Babu Radhakrishnan", "abstract": "Domain adversarial training has been ubiquitous for achieving invariant representations and is used widely for various domain adaptation tasks. In recent times, methods converging to smooth optima have shown improved generalization for supervised learning tasks like classification. In this work, we analyze the effect of smoothness enhancing formulations on domain adversarial training, the objective of which is a combination of task loss (eg. classification, regression etc.) and adversarial terms. We find that converging to a smooth minima with respect to (w.r.t.) task loss stabilizes the adversarial training leading to better performance on target domain. In contrast to task loss, our analysis shows that converging to smooth minima w.r.t. adversarial loss leads to sub-optimal generalization on the target domain. Based on the analysis, we introduce the Smooth Domain Adversarial Training (SDAT) procedure, which effectively enhances the performance of existing domain adversarial methods for both classification and object detection tasks. 
Our analysis also provides insight into the extensive usage of SGD over Adam in the community for domain adversarial training.", "bibtex": "@InProceedings{pmlr-v162-rangwani22a,\n title = \t {A Closer Look at Smoothness in Domain Adversarial Training},\n author = {Rangwani, Harsh and Aithal, Sumukh K and Mishra, Mayank and Jain, Arihant and Radhakrishnan, Venkatesh Babu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18378--18399},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rangwani22a/rangwani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rangwani22a.html},\n abstract = \t {Domain adversarial training has been ubiquitous for achieving invariant representations and is used widely for various domain adaptation tasks. In recent times, methods converging to smooth optima have shown improved generalization for supervised learning tasks like classification. In this work, we analyze the effect of smoothness enhancing formulations on domain adversarial training, the objective of which is a combination of task loss (eg. classification, regression etc.) and adversarial terms. We find that converging to a smooth minima with respect to (w.r.t.) task loss stabilizes the adversarial training leading to better performance on target domain. In contrast to task loss, our analysis shows that converging to smooth minima w.r.t. adversarial loss leads to sub-optimal generalization on the target domain. Based on the analysis, we introduce the Smooth Domain Adversarial Training (SDAT) procedure, which effectively enhances the performance of existing domain adversarial methods for both classification and object detection tasks. 
Our analysis also provides insight into the extensive usage of SGD over Adam in the community for domain adversarial training.}\n}", "pdf": "https://proceedings.mlr.press/v162/rangwani22a/rangwani22a.pdf", "supp": "", "pdf_size": 1138510, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11164597139581450427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Video Analytics Lab, Indian Institute of Science, Bengaluru, India+PES University, Bengaluru; Video Analytics Lab, Indian Institute of Science, Bengaluru, India; Video Analytics Lab, Indian Institute of Science, Bengaluru, India; Video Analytics Lab, Indian Institute of Science, Bengaluru, India+Amazon, India; Video Analytics Lab, Indian Institute of Science, Bengaluru, India", "aff_domain": "iisc.ac.in; ; ; ; ", "email": "iisc.ac.in; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/rangwani22a.html", "aff_unique_index": "0+1;0;0;0+2;0", "aff_unique_norm": "Indian Institute of Science;PES University;Amazon", "aff_unique_dep": "Video Analytics Lab;;Amazon", "aff_unique_url": "https://www.iisc.ac.in;https://pes.edu;https://www.amazon.in", "aff_unique_abbr": "IISc;PESU;Amazon", "aff_campus_unique_index": "0+0;0;0;0;0", "aff_campus_unique": "Bengaluru;", "aff_country_unique_index": "0+0;0;0;0+0;0", "aff_country_unique": "India" }, { "title": "A Completely Tuning-Free and Robust Approach to Sparse Precision Matrix Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17673", "id": "17673", "proceeding": "https://proceedings.mlr.press/v162/tran22b.html", "poster": "/media/PosterPDFs/ICML%202022/ce60ff163cab97029cc727e20e0fc3a7_XI110sd.png?t=1657498817.797107", "slides": "", "author_site": "Chau Tran, Guo Yu", "author": "Chau Tran; Guo Yu", "abstract": "Despite the vast literature on sparse Gaussian graphical models, current methods either are asymptotically tuning-free (which still require fine-tuning in practice) or hinge on computationally expensive methods (e.g., cross-validation) to determine the proper level of regularization. We propose a completely tuning-free approach for estimating sparse Gaussian graphical models. Our method uses model-agnostic regularization parameters to estimate each column of the target precision matrix and enjoys several desirable properties. Computationally, our estimator can be computed efficiently by linear programming. Theoretically, the proposed estimator achieves minimax optimal convergence rates under various norms. We further propose a second-stage enhancement with non-convex penalties which possesses the strong oracle property. Through comprehensive numerical studies, our methods demonstrate favorable statistical performance. 
Remarkably, our methods exhibit strong robustness to the violation of the Gaussian assumption and significantly outperform competing methods in the heavy-tailed settings.", "bibtex": "@InProceedings{pmlr-v162-tran22b,\n title = \t {A Completely Tuning-Free and Robust Approach to Sparse Precision Matrix Estimation},\n author = {Tran, Chau and Yu, Guo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21733--21750},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tran22b/tran22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/tran22b.html},\n abstract = \t {Despite the vast literature on sparse Gaussian graphical models, current methods either are asymptotically tuning-free (which still require fine-tuning in practice) or hinge on computationally expensive methods (e.g., cross-validation) to determine the proper level of regularization. We propose a completely tuning-free approach for estimating sparse Gaussian graphical models. Our method uses model-agnostic regularization parameters to estimate each column of the target precision matrix and enjoys several desirable properties. Computationally, our estimator can be computed efficiently by linear programming. Theoretically, the proposed estimator achieves minimax optimal convergence rates under various norms. We further propose a second-stage enhancement with non-convex penalties which possesses the strong oracle property. Through comprehensive numerical studies, our methods demonstrate favorable statistical performance. 
Remarkably, our methods exhibit strong robustness to the violation of the Gaussian assumption and significantly outperform competing methods in the heavy-tailed settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/tran22b/tran22b.pdf", "supp": "", "pdf_size": 470454, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4378381086963600534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Statistics and Applied Probability, University of California, Santa Barbara, CA, USA; Department of Statistics and Applied Probability, University of California, Santa Barbara, CA, USA", "aff_domain": "ucsb.edu; ", "email": "ucsb.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/tran22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "Department of Statistics and Applied Probability", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Consistent and Efficient Evaluation Strategy for Attribution Methods", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17561", "id": "17561", "proceeding": "https://proceedings.mlr.press/v162/rong22a.html", "poster": "/media/PosterPDFs/ICML%202022/f40438b554cc0e3d96ee6064c5798f55.png?t=1657282571.3368192", "slides": "", "author_site": "Yao Rong, Tobias Leemann, Vadim Borisov, Gjergji Kasneci, Enkelejda Kasneci", "author": "Yao Rong; Tobias Leemann; Vadim Borisov; Gjergji Kasneci; Enkelejda Kasneci", "abstract": "With a variety of local feature attribution methods being proposed in recent years, follow-up work suggested several evaluation strategies. To assess the attribution quality across different attribution techniques, the most popular among these evaluation strategies in the image domain use pixel perturbations. However, recent advances discovered that different evaluation strategies produce conflicting rankings of attribution methods and can be prohibitively expensive to compute. In this work, we present an information-theoretic analysis of evaluation strategies based on pixel perturbations. Our findings reveal that the results are strongly affected by information leakage through the shape of the removed pixels as opposed to their actual values. Using our theoretical insights, we propose a novel evaluation framework termed Remove and Debias (ROAD) which offers two contributions: First, it mitigates the impact of the confounders, which entails higher consistency among evaluation strategies. Second, ROAD does not require the computationally expensive retraining step and saves up to 99% in computational costs compared to the state-of-the-art. 
We release our source code at https://github.com/tleemann/road_evaluation.", "bibtex": "@InProceedings{pmlr-v162-rong22a,\n title = \t {A Consistent and Efficient Evaluation Strategy for Attribution Methods},\n author = {Rong, Yao and Leemann, Tobias and Borisov, Vadim and Kasneci, Gjergji and Kasneci, Enkelejda},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18770--18795},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rong22a/rong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rong22a.html},\n abstract = \t {With a variety of local feature attribution methods being proposed in recent years, follow-up work suggested several evaluation strategies. To assess the attribution quality across different attribution techniques, the most popular among these evaluation strategies in the image domain use pixel perturbations. However, recent advances discovered that different evaluation strategies produce conflicting rankings of attribution methods and can be prohibitively expensive to compute. In this work, we present an information-theoretic analysis of evaluation strategies based on pixel perturbations. Our findings reveal that the results are strongly affected by information leakage through the shape of the removed pixels as opposed to their actual values. Using our theoretical insights, we propose a novel evaluation framework termed Remove and Debias (ROAD) which offers two contributions: First, it mitigates the impact of the confounders, which entails higher consistency among evaluation strategies. Second, ROAD does not require the computationally expensive retraining step and saves up to 99% in computational costs compared to the state-of-the-art. 
We release our source code at https://github.com/tleemann/road_evaluation.}\n}", "pdf": "https://proceedings.mlr.press/v162/rong22a/rong22a.pdf", "supp": "", "pdf_size": 19527872, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16933534039020294474&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de; ; ; ", "email": "uni-tuebingen.de;uni-tuebingen.de; ; ; ", "github": "https://github.com/tleemann/road_evaluation", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/rong22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of T\u00fcbingen", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uni-tuebingen.de", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "T\u00fcbingen", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "A Context-Integrated Transformer-Based Neural Network for Auction Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17743", "id": "17743", "proceeding": "https://proceedings.mlr.press/v162/duan22a.html", "poster": "/media/PosterPDFs/ICML%202022/21be9a4bd4f81549a9d1d241981cec3c.png?t=1656677865.2573583", "slides": "/media/icml-2022/Slides/17743.pdf", "author_site": "Zhijian Duan, Jingwu Tang, Yutong Yin, Zhe Feng, Xiang Yan, Manzil Zaheer, Xiaotie Deng", "author": "Zhijian Duan; Jingwu Tang; Yutong Yin; Zhe Feng; Xiang Yan; Manzil Zaheer; Xiaotie Deng", "abstract": "One of the central problems in auction design is developing an incentive-compatible mechanism that maximizes the auctioneer\u2019s expected revenue. While theoretical approaches have encountered bottlenecks in multi-item auctions, recently, there has been much progress on finding the optimal mechanism through deep learning. However, these works either focus on a fixed set of bidders and items, or restrict the auction to be symmetric. In this work, we overcome such limitations by factoring", "bibtex": "@InProceedings{pmlr-v162-duan22a,\n title = \t {A Context-Integrated Transformer-Based Neural Network for Auction Design},\n author = {Duan, Zhijian and Tang, Jingwu and Yin, Yutong and Feng, Zhe and Yan, Xiang and Zaheer, Manzil and Deng, Xiaotie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5609--5626},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/duan22a/duan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/duan22a.html},\n abstract = \t {One of the central problems in auction design is developing an incentive-compatible mechanism that maximizes the auctioneer\u2019s expected revenue. 
While theoretical approaches have encountered bottlenecks in multi-item auctions, recently, there has been much progress on finding the optimal mechanism through deep learning. However, these works either focus on a fixed set of bidders and items, or restrict the auction to be symmetric. In this work, we overcome such limitations by factoring", "pdf": "https://proceedings.mlr.press/v162/duan22a/duan22a.pdf", "supp": "", "pdf_size": 735313, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9850607820011561614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Peking University; Peking University; Peking University; Google Research; Shanghai Jiao Tong University; Google DeepMind; Peking University", "aff_domain": "pku.edu.cn; ; ; ; ; ;pku.edu.cn", "email": "pku.edu.cn; ; ; ; ; ;pku.edu.cn", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/duan22a.html", "aff_unique_index": "0;0;0;1;2;1;0", "aff_unique_norm": "Peking University;Google;Shanghai Jiao Tong University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "http://www.pku.edu.cn;https://research.google;https://www.sjtu.edu.cn", "aff_unique_abbr": "Peking U;Google Research;SJTU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;1;0;2;0", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "A Convergence Theory for SVGD in the Population Limit under Talagrand\u2019s Inequality T1", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17593", "id": "17593", "proceeding": "https://proceedings.mlr.press/v162/salim22a.html", "poster": "", "slides": "", "author_site": "Adil Salim, Lukang Sun, Peter Richtarik", "author": "Adil Salim; Lukang Sun; Peter Richtarik", "abstract": "Stein Variational Gradient Descent (SVGD) is an algorithm for sampling from a target density which is known up to a multiplicative constant. Although SVGD is a popular algorithm in practice, its theoretical study is limited to a few recent works. We study the convergence of SVGD in the population limit, (i.e., with an infinite number of particles) to sample from a non-logconcave target distribution satisfying Talagrand\u2019s inequality T1. We first establish the convergence of the algorithm. Then, we establish a dimension-dependent complexity bound in terms of the Kernelized Stein Discrepancy (KSD). Unlike existing works, we do not assume that the KSD is bounded along the trajectory of the algorithm. 
Our approach relies on interpreting SVGD as a gradient descent over a space of probability measures.", "bibtex": "@InProceedings{pmlr-v162-salim22a,\n title = \t {A Convergence Theory for {SVGD} in the Population Limit under Talagrand\u2019s Inequality T1},\n author = {Salim, Adil and Sun, Lukang and Richtarik, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19139--19152},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/salim22a/salim22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/salim22a.html},\n abstract = \t {Stein Variational Gradient Descent (SVGD) is an algorithm for sampling from a target density which is known up to a multiplicative constant. Although SVGD is a popular algorithm in practice, its theoretical study is limited to a few recent works. We study the convergence of SVGD in the population limit, (i.e., with an infinite number of particles) to sample from a non-logconcave target distribution satisfying Talagrand\u2019s inequality T1. We first establish the convergence of the algorithm. Then, we establish a dimension-dependent complexity bound in terms of the Kernelized Stein Discrepancy (KSD). Unlike existing works, we do not assume that the KSD is bounded along the trajectory of the algorithm. Our approach relies on interpreting SVGD as a gradient descent over a space of probability measures.}\n}", "pdf": "https://proceedings.mlr.press/v162/salim22a/salim22a.pdf", "supp": "", "pdf_size": 296125, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2367099182954118616&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research, Redmond, USA; King Abdullah University of Science and Technology, Thuwal, Saudi Arabia; King Abdullah University of Science and Technology, Thuwal, Saudi Arabia", "aff_domain": "microsoft.com; ; ", "email": "microsoft.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/salim22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Microsoft;King Abdullah University of Science and Technology", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.kast.kau.edu.sa", "aff_unique_abbr": "MSR;KAUST", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Redmond;Thuwal", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Saudi Arabia" }, { "title": "A Convergent and Dimension-Independent Min-Max Optimization Algorithm", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18203", "id": "18203", "proceeding": "https://proceedings.mlr.press/v162/keswani22a.html", "poster": "/media/PosterPDFs/ICML%202022/51311013e51adebc3c34d2cc591fefee_dzZIx3V.png?t=1658427325.8748791", "slides": "", "author_site": "Vijay Keswani, Oren Mangoubi, Sushant Sachdeva, Nisheeth K. Vishnoi", "author": "Vijay Keswani; Oren Mangoubi; Sushant Sachdeva; Nisheeth K. Vishnoi", "abstract": "We study a variant of a recently introduced min-max optimization framework where the max-player is constrained to update its parameters in a greedy manner until it reaches a first-order stationary point. 
Our equilibrium definition for this framework depends on a proposal distribution which the min-player uses to choose directions in which to update its parameters. We show that, given a smooth and bounded nonconvex-nonconcave objective function, access to any proposal distribution for the min-player\u2019s updates, and stochastic gradient oracle for the max-player, our algorithm converges to the aforementioned approximate local equilibrium in a number of iterations that does not depend on the dimension. The equilibrium point found by our algorithm depends on the proposal distribution, and when applying our algorithm to train GANs we choose the proposal distribution to be a distribution of stochastic gradients. We empirically evaluate our algorithm on challenging nonconvex-nonconcave test-functions and loss functions arising in GAN training. Our algorithm converges on these test functions and, when used to train GANs, trains stably on synthetic and real-world datasets and avoids mode collapse.", "bibtex": "@InProceedings{pmlr-v162-keswani22a,\n title = \t {A Convergent and Dimension-Independent Min-Max Optimization Algorithm},\n author = {Keswani, Vijay and Mangoubi, Oren and Sachdeva, Sushant and Vishnoi, Nisheeth K.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10939--10973},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/keswani22a/keswani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/keswani22a.html},\n abstract = \t {We study a variant of a recently introduced min-max optimization framework where the max-player is constrained to update its parameters in a greedy manner until it reaches a first-order stationary point. Our equilibrium definition for this framework depends on a proposal distribution which the min-player uses to choose directions in which to update its parameters. We show that, given a smooth and bounded nonconvex-nonconcave objective function, access to any proposal distribution for the min-player\u2019s updates, and stochastic gradient oracle for the max-player, our algorithm converges to the aforementioned approximate local equilibrium in a number of iterations that does not depend on the dimension. The equilibrium point found by our algorithm depends on the proposal distribution, and when applying our algorithm to train GANs we choose the proposal distribution to be a distribution of stochastic gradients. We empirically evaluate our algorithm on challenging nonconvex-nonconcave test-functions and loss functions arising in GAN training. 
Our algorithm converges on these test functions and, when used to train GANs, trains stably on synthetic and real-world datasets and avoids mode collapse.}\n}", "pdf": "https://proceedings.mlr.press/v162/keswani22a/keswani22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/keswani22a-supp.zip", "pdf_size": 8683627, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17533849596525172458&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics and Data Science, Yale University, US; Department of Mathematical Sciences, Worcester Polytechnic Institute, US; Department of Computer Science, University of Toronto, Canada; Department of Computer Science, Yale University, US", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/keswani22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Yale University;Worcester Polytechnic Institute;University of Toronto", "aff_unique_dep": "Department of Statistics and Data Science;Department of Mathematical Sciences;Department of Computer Science", "aff_unique_url": "https://www.yale.edu;https://www.wpi.edu;https://www.utoronto.ca", "aff_unique_abbr": "Yale;WPI;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "A Deep Learning Approach for the Segmentation of Electroencephalography Data in Eye Tracking Applications", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17761", "id": "17761", "proceeding": "https://proceedings.mlr.press/v162/wolf22a.html", "poster": "/media/PosterPDFs/ICML%202022/d882050bb9eeba930974f596931be527.png?t=1657975260.6719964", "slides": "", "author_site": "Lukas Wolf, Ard Kastrati, Martyna Plomecka, Jieming Li, Dustin Klebe, Alexander Veicht, Roger Wattenhofer, Nicolas Langer", "author": "Lukas Wolf; Ard Kastrati; Martyna B Plomecka; Jie-Ming Li; Dustin Klebe; Alexander Veicht; Roger Wattenhofer; Nicolas Langer", "abstract": "The collection of eye gaze information provides a window into many critical aspects of human cognition, health and behaviour. Additionally, many neuroscientific studies complement the behavioural information gained from eye tracking with the high temporal resolution and neurophysiological markers provided by electroencephalography (EEG). One of the essential eye-tracking software processing steps is the segmentation of the continuous data stream into events relevant to eye-tracking applications, such as saccades, fixations, and blinks. Here, we introduce DETRtime, a novel framework for time-series segmentation that creates ocular event detectors that do not require additionally recorded eye-tracking modality and rely solely on EEG data. Our end-to-end deep-learning-based framework brings recent advances in Computer Vision to the forefront of the times series segmentation of EEG data. DETRtime achieves state-of-the-art performance in ocular event detection across diverse eye-tracking experiment paradigms. 
In addition to that, we provide evidence that our model generalizes well in the task of EEG sleep stage segmentation.", "bibtex": "@InProceedings{pmlr-v162-wolf22a,\n title = \t {A Deep Learning Approach for the Segmentation of Electroencephalography Data in Eye Tracking Applications},\n author = {Wolf, Lukas and Kastrati, Ard and Plomecka, Martyna B and Li, Jie-Ming and Klebe, Dustin and Veicht, Alexander and Wattenhofer, Roger and Langer, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23912--23932},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wolf22a/wolf22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wolf22a.html},\n abstract = \t {The collection of eye gaze information provides a window into many critical aspects of human cognition, health and behaviour. Additionally, many neuroscientific studies complement the behavioural information gained from eye tracking with the high temporal resolution and neurophysiological markers provided by electroencephalography (EEG). One of the essential eye-tracking software processing steps is the segmentation of the continuous data stream into events relevant to eye-tracking applications, such as saccades, fixations, and blinks. Here, we introduce DETRtime, a novel framework for time-series segmentation that creates ocular event detectors that do not require additionally recorded eye-tracking modality and rely solely on EEG data. Our end-to-end deep-learning-based framework brings recent advances in Computer Vision to the forefront of the times series segmentation of EEG data. DETRtime achieves state-of-the-art performance in ocular event detection across diverse eye-tracking experiment paradigms. In addition to that, we provide evidence that our model generalizes well in the task of EEG sleep stage segmentation.}\n}", "pdf": "https://proceedings.mlr.press/v162/wolf22a/wolf22a.pdf", "supp": "", "pdf_size": 7259740, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=561665774245262907&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/wolf22a.html" }, { "title": "A Difference Standardization Method for Mutual Transfer Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17525", "id": "17525", "proceeding": "https://proceedings.mlr.press/v162/xu22j.html", "poster": "/media/PosterPDFs/ICML%202022/d72a7ed33514158ae5e68ed6d80177b9.png?t=1656078069.170649", "slides": "/media/icml-2022/Slides/17525_K8PmMPr.pdf", "author_site": "Haoqing Xu, Meng Wang, Beilun Wang", "author": "Haoqing Xu; Meng Wang; Beilun Wang", "abstract": "In many real-world applications, mutual transfer learning is the paradigm that each data domain can potentially be a source or target domain. This is quite different from transfer learning tasks where the source and target are known a priori. However, previous studies about mutual transfer learning either suffer from high computational complexity or oversimplified hypothesis. 
To overcome these challenges, in this paper, we propose the \\underline{Diff}erence \\underline{S}tandardization method ({\\bf DiffS}) for mutual transfer learning. Specifically, we put forward a novel distance metric between domains, the standardized domain difference, to obtain fast structure recovery and accurate parameter estimation simultaneously. We validate the method\u2019s performance using both synthetic and real-world data. Compared to previous methods, DiffS demonstrates a speed-up of approximately 3000 times that of similar methods and achieves the same accurate learnability structure estimation.", "bibtex": "@InProceedings{pmlr-v162-xu22j,\n title = \t {A Difference Standardization Method for Mutual Transfer Learning},\n author = {Xu, Haoqing and Wang, Meng and Wang, Beilun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24683--24697},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22j/xu22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22j.html},\n abstract = \t {In many real-world applications, mutual transfer learning is the paradigm that each data domain can potentially be a source or target domain. This is quite different from transfer learning tasks where the source and target are known a priori. However, previous studies about mutual transfer learning either suffer from high computational complexity or oversimplified hypothesis. To overcome these challenges, in this paper, we propose the \\underline{Diff}erence \\underline{S}tandardization method ({\\bf DiffS}) for mutual transfer learning. Specifically, we put forward a novel distance metric between domains, the standardized domain difference, to obtain fast structure recovery and accurate parameter estimation simultaneously. We validate the method\u2019s performance using both synthetic and real-world data. 
Compared to previous methods, DiffS demonstrates a speed-up of approximately 3000 times that of similar methods and achieves the same accurate learnability structure estimation.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22j/xu22j.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/xu22j-supp.zip", "pdf_size": 5382073, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11621787324386732159&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computer Science and Engineering, Southeast University, Nanjing 210096, China; School of Computer Science and Engineering, Southeast University, Nanjing 210096, China + Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China; School of Computer Science and Engineering, Southeast University, Nanjing 210096, China + Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn", "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/xu22j.html", "aff_unique_index": "0;0+0;0+0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0;0+0;0+0", "aff_country_unique": "China" }, { "title": "A Differential Entropy Estimator for Training Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16325", "id": "16325", "proceeding": "https://proceedings.mlr.press/v162/pichler22a.html", "poster": "/media/PosterPDFs/ICML%202022/2d36b5821f8affc6868b59dfc9af6c9f.png?t=1657471992.0265052", "slides": "", "author_site": "Georg Pichler, Pierre Colombo, Malik Boudiaf, G\u00fcnther Koliander, Pablo Piantanida", "author": "Georg Pichler; Pierre Jean A. Colombo; Malik Boudiaf; G\u00fcnther Koliander; Pablo Piantanida", "abstract": "Mutual Information (MI) has been widely used as a loss regularizer for training neural networks. This has been particularly effective when learn disentangled or compressed representations of high dimensional data. However, differential entropy (DE), another fundamental measure of information, has not found widespread use in neural network training. Although DE offers a potentially wider range of applications than MI, off-the-shelf DE estimators are either non differentiable, computationally intractable or fail to adapt to changes in the underlying distribution. These drawbacks prevent them from being used as regularizers in neural networks training. To address shortcomings in previously proposed estimators for DE, here we introduce KNIFE, a fully parameterized, differentiable kernel-based estimator of DE. The flexibility of our approach also allows us to construct KNIFE-based estimators for conditional (on either discrete or continuous variables) DE, as well as MI. We empirically validate our method on high-dimensional synthetic data and further apply it to guide the training of neural networks for real-world tasks. Our experiments on a large variety of tasks, including visual domain adaptation, textual fair classification, and textual fine-tuning demonstrate the effectiveness of KNIFE-based estimation. 
Code can be found at https://github.com/g-pichler/knife.", "bibtex": "@InProceedings{pmlr-v162-pichler22a,\n title = \t {A Differential Entropy Estimator for Training Neural Networks},\n author = {Pichler, Georg and Colombo, Pierre Jean A. and Boudiaf, Malik and Koliander, G{\\\"u}nther and Piantanida, Pablo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17691--17715},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pichler22a/pichler22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pichler22a.html},\n abstract = \t {Mutual Information (MI) has been widely used as a loss regularizer for training neural networks. This has been particularly effective when learn disentangled or compressed representations of high dimensional data. However, differential entropy (DE), another fundamental measure of information, has not found widespread use in neural network training. Although DE offers a potentially wider range of applications than MI, off-the-shelf DE estimators are either non differentiable, computationally intractable or fail to adapt to changes in the underlying distribution. These drawbacks prevent them from being used as regularizers in neural networks training. To address shortcomings in previously proposed estimators for DE, here we introduce KNIFE, a fully parameterized, differentiable kernel-based estimator of DE. The flexibility of our approach also allows us to construct KNIFE-based estimators for conditional (on either discrete or continuous variables) DE, as well as MI. We empirically validate our method on high-dimensional synthetic data and further apply it to guide the training of neural networks for real-world tasks. Our experiments on a large variety of tasks, including visual domain adaptation, textual fair classification, and textual fine-tuning demonstrate the effectiveness of KNIFE-based estimation. 
Code can be found at https://github.com/g-pichler/knife.}\n}", "pdf": "https://proceedings.mlr.press/v162/pichler22a/pichler22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/pichler22a-supp.zip", "pdf_size": 2768325, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5856117255578319314&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Institute of Telecommunications, TU Wien, 1040 Vienna, Austria+Laboratoire des Signaux et Syst\u00e8mes (L2S),Paris-Saclay CNRS CentraleSup\u00e9lec, 91190 Gif-sur-Yvette, France+\u00c9TS Montreal, Quebec H3C 1K3, Canada; Laboratoire des Signaux et Syst\u00e8mes (L2S),Paris-Saclay CNRS CentraleSup\u00e9lec, 91190 Gif-sur-Yvette, France+\u00c9TS Montreal, Quebec H3C 1K3, Canada; \u00c9TS Montreal, Quebec H3C 1K3, Canada; Acoustics Research Institute, Austrian Academy of Sciences, 1040, Vienna, Austria; International Laboratory on Learning Systems (ILLS),Universit\u00e9 McGill - ETS - MILA - CNRS - Universit\u00e9 Paris-Saclay - CentraleSup\u00e9lec , Montreal, Quebec, Canada", "aff_domain": "ieee.org; ; ; ; ", "email": "ieee.org; ; ; ; ", "github": "https://github.com/g-pichler/knife", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/pichler22a.html", "aff_unique_index": "0+1+2;1+2;2;3;4", "aff_unique_norm": "TU Wien;CentraleSup\u00e9lec;\u00c9cole de technologie sup\u00e9rieure;Austrian Academy of Sciences;Universit\u00e9 McGill", "aff_unique_dep": "Institute of Telecommunications;Laboratoire des Signaux et Syst\u00e8mes (L2S);;Acoustics Research Institute;International Laboratory on Learning Systems (ILLS)", "aff_unique_url": "https://www.tuwien.ac.at;https://www.centralesupelec.fr;https://www.etsmtl.ca;https://www.oeaw.ac.at;https://www.mcgill.ca", "aff_unique_abbr": "TU Wien;CentraleSup\u00e9lec;\u00c9TS;OEAW;McGill", "aff_campus_unique_index": "0+1+2;1+2;2;0;2", "aff_campus_unique": "Vienna;Paris-Saclay;Montreal", "aff_country_unique_index": "0+1+2;1+2;2;0;2", "aff_country_unique": "Austria;France;Canada" }, { "title": "A Dynamical System Perspective for Lipschitz Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17625", "id": "17625", "proceeding": "https://proceedings.mlr.press/v162/meunier22a.html", "poster": "/media/PosterPDFs/ICML%202022/01064f1de9dfcd9d77b14d11beefefd4.png?t=1657705859.0055492", "slides": "", "author_site": "Laurent Meunier, Blaise Delattre, Alexandre ARAUJO, Alexandre Allauzen", "author": "Laurent Meunier; Blaise J Delattre; Alexandre Araujo; Alexandre Allauzen", "abstract": "The Lipschitz constant of neural networks has been established as a key quantity to enforce the robustness to adversarial examples. In this paper, we tackle the problem of building $1$-Lipschitz Neural Networks. By studying Residual Networks from a continuous time dynamical system perspective, we provide a generic method to build $1$-Lipschitz Neural Networks and show that some previous approaches are special cases of this framework. 
Then, we extend this reasoning and show that ResNet flows derived from convex potentials define $1$-Lipschitz transformations, that lead us to define the", "bibtex": "@InProceedings{pmlr-v162-meunier22a,\n title = \t {A Dynamical System Perspective for {L}ipschitz Neural Networks},\n author = {Meunier, Laurent and Delattre, Blaise J and Araujo, Alexandre and Allauzen, Alexandre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15484--15500},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/meunier22a/meunier22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/meunier22a.html},\n abstract = \t {The Lipschitz constant of neural networks has been established as a key quantity to enforce the robustness to adversarial examples. In this paper, we tackle the problem of building $1$-Lipschitz Neural Networks. By studying Residual Networks from a continuous time dynamical system perspective, we provide a generic method to build $1$-Lipschitz Neural Networks and show that some previous approaches are special cases of this framework. Then, we extend this reasoning and show that ResNet flows derived from convex potentials define $1$-Lipschitz transformations, that lead us to define the", "pdf": "https://proceedings.mlr.press/v162/meunier22a/meunier22a.pdf", "supp": "", "pdf_size": 485741, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3492129744304387484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Miles Team, LAMSADE, Universit\u00e9 Paris-Dauphine, PSL University, Paris, France+Meta AI Research, Paris, France; Miles Team, LAMSADE, Universit\u00e9 Paris-Dauphine, PSL University, Paris, France+Foxstream, Lyon, France; INRIA, Ecole Normale Sup\u00e9rieure, CNRS, PSL University, Paris, France; Miles Team, LAMSADE, Universit\u00e9 Paris-Dauphine, PSL University, Paris, France+ESPCI, Paris, France", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "https://github.com/MILES-PSL/Convex-Potential-Layer", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/meunier22a.html", "aff_unique_index": "0+1;0+2;3;0+4", "aff_unique_norm": "Universit\u00e9 Paris-Dauphine;Meta;Foxstream;INRIA;Ecole Sup\u00e9rieure de Physique et de Chimie Industrielles", "aff_unique_dep": "LAMSADE;AI Research;;;", "aff_unique_url": "https://www.univ-paris-dauphine.fr;https://research.facebook.com;;https://www.inria.fr;https://www.espci.fr", "aff_unique_abbr": "UPD;Meta AI;;INRIA;ESPCI", "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0+0;0+0;0;0+0", "aff_country_unique": "France" }, { "title": "A Framework for Learning to Request Rich and Contextually Useful Information from Humans", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16819", "id": "16819", "proceeding": "https://proceedings.mlr.press/v162/nguyen22a.html", "poster": "/media/PosterPDFs/ICML%202022/4c4ea5258ef3fb3fb1fc48fee9b4408c.png?t=1657488357.3106043", "slides": "", "author_site": "Khanh Nguyen, Yonatan Bisk, Hal Daum\u00e9 III", "author": "Khanh X Nguyen; Yonatan Bisk; Hal Daum\u00e9 Iii", "abstract": "When deployed, AI agents will encounter problems that are 
beyond their autonomous problem-solving capabilities. Leveraging human assistance can help agents overcome their inherent limitations and robustly cope with unfamiliar situations. We present a general interactive framework that enables an agent to request and interpret rich, contextually useful information from an assistant that has knowledge about the task and the environment. We demonstrate the practicality of our framework on a simulated human-assisted navigation problem. Aided with an assistance-requesting policy learned by our method, a navigation agent achieves up to a 7{\\texttimes} improvement in success rate on tasks that take place in previously unseen environments, compared to fully autonomous behavior. We show that the agent can take advantage of different types of information depending on the context, and analyze the benefits and challenges of learning the assistance-requesting policy when the assistant can recursively decompose tasks into subtasks.", "bibtex": "@InProceedings{pmlr-v162-nguyen22a,\n title = \t {A Framework for Learning to Request Rich and Contextually Useful Information from Humans},\n author = {Nguyen, Khanh X and Bisk, Yonatan and Iii, Hal Daum{\\'e}},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16553--16568},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nguyen22a/nguyen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nguyen22a.html},\n abstract = \t {When deployed, AI agents will encounter problems that are beyond their autonomous problem-solving capabilities. Leveraging human assistance can help agents overcome their inherent limitations and robustly cope with unfamiliar situations. We present a general interactive framework that enables an agent to request and interpret rich, contextually useful information from an assistant that has knowledge about the task and the environment. We demonstrate the practicality of our framework on a simulated human-assisted navigation problem. Aided with an assistance-requesting policy learned by our method, a navigation agent achieves up to a 7{\\texttimes} improvement in success rate on tasks that take place in previously unseen environments, compared to fully autonomous behavior. 
We show that the agent can take advantage of different types of information depending on the context, and analyze the benefits and challenges of learning the assistance-requesting policy when the assistant can recursively decompose tasks into subtasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/nguyen22a/nguyen22a.pdf", "supp": "", "pdf_size": 1228156, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4685371682242943062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Maryland, College Park; Carnegie Mellon University; Microsoft Research", "aff_domain": "umd.edu; ; ", "email": "umd.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/nguyen22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Maryland;Carnegie Mellon University;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www/umd.edu;https://www.cmu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UMD;CMU;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Functional Information Perspective on Model Interpretation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17195", "id": "17195", "proceeding": "https://proceedings.mlr.press/v162/gat22a.html", "poster": "/media/PosterPDFs/ICML%202022/9afefc52942cb83c7c1f14b2139b09ba.png?t=1656440018.829339", "slides": "", "author_site": "Itai Gat, Nitay Calderon, Roi Reichart, Tamir Hazan", "author": "Itai Gat; Nitay Calderon; Roi Reichart; Tamir Hazan", "abstract": "Contemporary predictive models are hard to interpret as their deep nets exploit numerous complex relations between input elements. This work suggests a theoretical framework for model interpretability by measuring the contribution of relevant features to the functional entropy of the network with respect to the input. We rely on the log-Sobolev inequality that bounds the functional entropy by the functional Fisher information with respect to the covariance of the data. This provides a principled way to measure the amount of information contribution of a subset of features to the decision function. Through extensive experiments, we show that our method surpasses existing interpretability sampling-based methods on various data signals such as image, text, and audio.", "bibtex": "@InProceedings{pmlr-v162-gat22a,\n title = \t {A Functional Information Perspective on Model Interpretation},\n author = {Gat, Itai and Calderon, Nitay and Reichart, Roi and Hazan, Tamir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7266--7278},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gat22a/gat22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gat22a.html},\n abstract = \t {Contemporary predictive models are hard to interpret as their deep nets exploit numerous complex relations between input elements. This work suggests a theoretical framework for model interpretability by measuring the contribution of relevant features to the functional entropy of the network with respect to the input. 
We rely on the log-Sobolev inequality that bounds the functional entropy by the functional Fisher information with respect to the covariance of the data. This provides a principled way to measure the amount of information contribution of a subset of features to the decision function. Through extensive experiments, we show that our method surpasses existing interpretability sampling-based methods on various data signals such as image, text, and audio.}\n}", "pdf": "https://proceedings.mlr.press/v162/gat22a/gat22a.pdf", "supp": "", "pdf_size": 3652697, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5647868257497386951&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Technion - Israel Institute of Technology; Technion - Israel Institute of Technology; Technion - Israel Institute of Technology; Technion - Israel Institute of Technology", "aff_domain": "technion.ac.il; ; ; ", "email": "technion.ac.il; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gat22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "A General Recipe for Likelihood-free Bayesian Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16467", "id": "16467", "proceeding": "https://proceedings.mlr.press/v162/song22b.html", "poster": "/media/PosterPDFs/ICML%202022/4a4526b1ec301744aba9526d78fcb2a6.png?t=1657731522.8941777", "slides": "", "author_site": "Jiaming Song, Lantao Yu, Willie Neiswanger, Stefano Ermon", "author": "Jiaming Song; Lantao Yu; Willie Neiswanger; Stefano Ermon", "abstract": "The acquisition function, a critical component in Bayesian optimization (BO), can often be written as the expectation of a utility function under a surrogate model. However, to ensure that acquisition functions are tractable to optimize, restrictions must be placed on the surrogate model and utility function. To extend BO to a broader class of models and utilities, we propose likelihood-free BO (LFBO), an approach based on likelihood-free inference. LFBO directly models the acquisition function without having to separately perform inference with a probabilistic surrogate model. We show that computing the acquisition function in LFBO can be reduced to optimizing a weighted classification problem, which extends an existing likelihood-free density ratio estimation method related to probability of improvement (PI). By choosing the utility function for expected improvement (EI), LFBO outperforms the aforementioned method, as well as various state-of-the-art black-box optimization methods on several real-world optimization problems. 
LFBO can also leverage composite structures of the objective function, which further improves its regret by several orders of magnitude.", "bibtex": "@InProceedings{pmlr-v162-song22b,\n title = \t {A General Recipe for Likelihood-free {B}ayesian Optimization},\n author = {Song, Jiaming and Yu, Lantao and Neiswanger, Willie and Ermon, Stefano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20384--20404},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/song22b/song22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/song22b.html},\n abstract = \t {The acquisition function, a critical component in Bayesian optimization (BO), can often be written as the expectation of a utility function under a surrogate model. However, to ensure that acquisition functions are tractable to optimize, restrictions must be placed on the surrogate model and utility function. To extend BO to a broader class of models and utilities, we propose likelihood-free BO (LFBO), an approach based on likelihood-free inference. LFBO directly models the acquisition function without having to separately perform inference with a probabilistic surrogate model. We show that computing the acquisition function in LFBO can be reduced to optimizing a weighted classification problem, which extends an existing likelihood-free density ratio estimation method related to probability of improvement (PI). By choosing the utility function for expected improvement (EI), LFBO outperforms the aforementioned method, as well as various state-of-the-art black-box optimization methods on several real-world optimization problems. 
LFBO can also leverage composite structures of the objective function, which further improves its regret by several orders of magnitude.}\n}", "pdf": "https://proceedings.mlr.press/v162/song22b/song22b.pdf", "supp": "", "pdf_size": 1112377, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2199690906597156790&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "NVIDIA (Work done while at Stanford); Stanford University; Stanford University; Stanford University", "aff_domain": "nvidia.com; ; ; ", "email": "nvidia.com; ; ; ", "github": "", "project": "https://lfbo-ml.github.io/", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/song22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Hierarchical Bayesian Approach to Inverse Reinforcement Learning with Symbolic Reward Machines", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17421", "id": "17421", "proceeding": "https://proceedings.mlr.press/v162/zhou22b.html", "poster": "/media/PosterPDFs/ICML%202022/dff8e9c2ac33381546d96deea9922999.png?t=1657525056.7668982", "slides": "", "author_site": "Weichao Zhou, Wenchao Li", "author": "Weichao Zhou; Wenchao Li", "abstract": "A misspecified reward can degrade sample efficiency and induce undesired behaviors in reinforcement learning (RL) problems. We propose symbolic reward machines for incorporating high-level task knowledge when specifying the reward signals. Symbolic reward machines augment existing reward machine formalism by allowing transitions to carry predicates and symbolic reward outputs. This formalism lends itself well to inverse reinforcement learning, whereby the key challenge is determining appropriate assignments to the symbolic values from a few expert demonstrations. We propose a hierarchical Bayesian approach for inferring the most likely assignments such that the concretized reward machine can discriminate expert demonstrated trajectories from other trajectories with high accuracy. Experimental results show that learned reward machines can significantly improve training efficiency for complex RL tasks and generalize well across different task environment configurations.", "bibtex": "@InProceedings{pmlr-v162-zhou22b,\n title = \t {A Hierarchical {B}ayesian Approach to Inverse Reinforcement Learning with Symbolic Reward Machines},\n author = {Zhou, Weichao and Li, Wenchao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27159--27178},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22b/zhou22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22b.html},\n abstract = \t {A misspecified reward can degrade sample efficiency and induce undesired behaviors in reinforcement learning (RL) problems. We propose symbolic reward machines for incorporating high-level task knowledge when specifying the reward signals. 
Symbolic reward machines augment existing reward machine formalism by allowing transitions to carry predicates and symbolic reward outputs. This formalism lends itself well to inverse reinforcement learning, whereby the key challenge is determining appropriate assignments to the symbolic values from a few expert demonstrations. We propose a hierarchical Bayesian approach for inferring the most likely assignments such that the concretized reward machine can discriminate expert demonstrated trajectories from other trajectories with high accuracy. Experimental results show that learned reward machines can significantly improve training efficiency for complex RL tasks and generalize well across different task environment configurations.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22b/zhou22b.pdf", "supp": "", "pdf_size": 4278620, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13941613076032869632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of ECE, Boston University; Department of ECE, Boston University", "aff_domain": "bu.edu; ", "email": "bu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhou22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Hierarchical Transitive-Aligned Graph Kernel for Un-attributed Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16245", "id": "16245", "proceeding": "https://proceedings.mlr.press/v162/bai22a.html", "poster": "/media/PosterPDFs/ICML%202022/0d7363894acdee742caf7fe4e97c4d49.png?t=1657424574.761766", "slides": "", "author_site": "Lu Bai, Lixin Cui, Edwin Hancock", "author": "Lu Bai; Lixin Cui; Hancock Edwin", "abstract": "In this paper, we develop a new graph kernel, namely the Hierarchical Transitive-Aligned Kernel, by transitively aligning the vertices between graphs through a family of hierarchical prototype graphs. Comparing to most existing state-of-the-art graph kernels, the proposed kernel has three theoretical advantages. First, it incorporates the locational correspondence information between graphs into the kernel computation, and thus overcomes the shortcoming of ignoring structural correspondences arising in most R-convolution kernels. Second, it guarantees the transitivity between the correspondence information that is not available for most existing matching kernels. Third, it incorporates the information of all graphs under comparisons into the kernel computation process, and thus encapsulates richer characteristics. 
Experimental evaluations demonstrate the effectiveness of the new transitive-aligned kernel.", "bibtex": "@InProceedings{pmlr-v162-bai22a,\n title = \t {A Hierarchical Transitive-Aligned Graph Kernel for Un-attributed Graphs},\n author = {Bai, Lu and Cui, Lixin and Edwin, Hancock},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1327--1336},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bai22a/bai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bai22a.html},\n abstract = \t {In this paper, we develop a new graph kernel, namely the Hierarchical Transitive-Aligned Kernel, by transitively aligning the vertices between graphs through a family of hierarchical prototype graphs. Comparing to most existing state-of-the-art graph kernels, the proposed kernel has three theoretical advantages. First, it incorporates the locational correspondence information between graphs into the kernel computation, and thus overcomes the shortcoming of ignoring structural correspondences arising in most R-convolution kernels. Second, it guarantees the transitivity between the correspondence information that is not available for most existing matching kernels. Third, it incorporates the information of all graphs under comparisons into the kernel computation process, and thus encapsulates richer characteristics. Experimental evaluations demonstrate the effectiveness of the new transitive-aligned kernel.}\n}", "pdf": "https://proceedings.mlr.press/v162/bai22a/bai22a.pdf", "supp": "", "pdf_size": 2010999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3654232182177250218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "School of Artificial Intelligence, Beijing Normal University, Beijing, China+Central University of Finance and Economics, Beijing, China; Central University of Finance and Economics, Beijing, China; Department of Computer Science, University of York, York, UK", "aff_domain": "cufe.edu.cn;cufe.edu.cn; ", "email": "cufe.edu.cn;cufe.edu.cn; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bai22a.html", "aff_unique_index": "0+1;1;2", "aff_unique_norm": "Beijing Normal University;Central University of Finance and Economics;University of York", "aff_unique_dep": "School of Artificial Intelligence;;Department of Computer Science", "aff_unique_url": "https://www.bnu.edu.cn;http://www.cufe.edu.cn;https://www.york.ac.uk", "aff_unique_abbr": "BNU;CUFE;York", "aff_campus_unique_index": "0+0;0;1", "aff_campus_unique": "Beijing;York", "aff_country_unique_index": "0+0;0;1", "aff_country_unique": "China;United Kingdom" }, { "title": "A Joint Exponential Mechanism For Differentially Private Top-$k$", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16103", "id": "16103", "proceeding": "https://proceedings.mlr.press/v162/gillenwater22a.html", "poster": "/media/PosterPDFs/ICML%202022/68d30a9594728bc39aa24be94b319d21.png?t=1658144612.4291072", "slides": "", "author_site": "Jennifer Gillenwater, Matthew Joseph, andres munoz, Monica Ribero Diaz", "author": "Jennifer Gillenwater; Matthew Joseph; Andres Munoz; Monica Ribero Diaz", "abstract": "We present a differentially 
private algorithm for releasing the sequence of $k$ elements with the highest counts from a data domain of $d$ elements. The algorithm is a \"joint\" instance of the exponential mechanism, and its output space consists of all $O(d^k)$ length-$k$ sequences. Our main contribution is a method to sample this exponential mechanism in time $O(dk\\log(k) + d\\log(d))$ and space $O(dk)$. Experiments show that this approach outperforms existing pure differential privacy methods and improves upon even approximate differential privacy methods for moderate $k$.", "bibtex": "@InProceedings{pmlr-v162-gillenwater22a,\n title = \t {A Joint Exponential Mechanism For Differentially Private Top-$k$},\n author = {Gillenwater, Jennifer and Joseph, Matthew and Munoz, Andres and Diaz, Monica Ribero},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7570--7582},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gillenwater22a/gillenwater22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gillenwater22a.html},\n abstract = \t {We present a differentially private algorithm for releasing the sequence of $k$ elements with the highest counts from a data domain of $d$ elements. The algorithm is a \"joint\" instance of the exponential mechanism, and its output space consists of all $O(d^k)$ length-$k$ sequences. Our main contribution is a method to sample this exponential mechanism in time $O(dk\\log(k) + d\\log(d))$ and space $O(dk)$. Experiments show that this approach outperforms existing pure differential privacy methods and improves upon even approximate differential privacy methods for moderate $k$.}\n}", "pdf": "https://proceedings.mlr.press/v162/gillenwater22a/gillenwater22a.pdf", "supp": "", "pdf_size": 1033788, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17002847930717391794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google Research NYC; Google Research NYC; Google Research NYC; UT Austin", "aff_domain": "google.com;google.com;google.com;utexas.edu", "email": "google.com;google.com;google.com;utexas.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gillenwater22a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Google;University of Texas at Austin", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.utexas.edu", "aff_unique_abbr": "Google Research;UT Austin", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "New York City;Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Langevin-like Sampler for Discrete Distributions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17045", "id": "17045", "proceeding": "https://proceedings.mlr.press/v162/zhang22t.html", "poster": "/media/PosterPDFs/ICML%202022/ad5ab36761669d6eadbaee691c4a1d22.png?t=1658120746.6552093", "slides": "", "author_site": "Ruqi Zhang, Xingchao Liu, Qiang Liu", "author": "Ruqi Zhang; Xingchao Liu; Qiang Liu", "abstract": "We propose discrete Langevin proposal (DLP), a simple and scalable gradient-based proposal for sampling complex high-dimensional discrete distributions. 
In contrast to Gibbs sampling-based methods, DLP is able to update all coordinates in parallel in a single step and the magnitude of changes is controlled by a stepsize. This allows a cheap and efficient exploration in the space of high-dimensional and strongly correlated variables. We prove the efficiency of DLP by showing that the asymptotic bias of its stationary distribution is zero for log-quadratic distributions, and is small for distributions that are close to being log-quadratic. With DLP, we develop several variants of sampling algorithms, including unadjusted, Metropolis-adjusted, stochastic and preconditioned versions. DLP outperforms many popular alternatives on a wide variety of tasks, including Ising models, restricted Boltzmann machines, deep energy-based models, binary neural networks and language generation.", "bibtex": "@InProceedings{pmlr-v162-zhang22t,\n title = \t {A {L}angevin-like Sampler for Discrete Distributions},\n author = {Zhang, Ruqi and Liu, Xingchao and Liu, Qiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26375--26396},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22t/zhang22t.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22t.html},\n abstract = \t {We propose discrete Langevin proposal (DLP), a simple and scalable gradient-based proposal for sampling complex high-dimensional discrete distributions. In contrast to Gibbs sampling-based methods, DLP is able to update all coordinates in parallel in a single step and the magnitude of changes is controlled by a stepsize. This allows a cheap and efficient exploration in the space of high-dimensional and strongly correlated variables. We prove the efficiency of DLP by showing that the asymptotic bias of its stationary distribution is zero for log-quadratic distributions, and is small for distributions that are close to being log-quadratic. With DLP, we develop several variants of sampling algorithms, including unadjusted, Metropolis-adjusted, stochastic and preconditioned versions. 
DLP outperforms many popular alternatives on a wide variety of tasks, including Ising models, restricted Boltzmann machines, deep energy-based models, binary neural networks and language generation.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22t/zhang22t.pdf", "supp": "", "pdf_size": 1185133, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3541239242626478838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "The University of Texas at Austin; The University of Texas at Austin; The University of Texas at Austin", "aff_domain": "utexas.edu; ; ", "email": "utexas.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22t.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Marriage between Adversarial Team Games and 2-player Games: Enabling Abstractions, No-regret Learning, and Subgame Solving", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17641", "id": "17641", "proceeding": "https://proceedings.mlr.press/v162/carminati22a.html", "poster": "/media/PosterPDFs/ICML%202022/cacbf64b8a464fa1974da1eb0aa92851_QrzuKuF.png?t=1657838645.096116", "slides": "", "author_site": "Luca Carminati, Federico Cacciamani, Marco Ciccone, Nicola Gatti", "author": "Luca Carminati; Federico Cacciamani; Marco Ciccone; Nicola Gatti", "abstract": "", "bibtex": "@InProceedings{pmlr-v162-carminati22a,\n title = \t {A Marriage between Adversarial Team Games and 2-player Games: Enabling Abstractions, No-regret Learning, and Subgame Solving},\n author = {Carminati, Luca and Cacciamani, Federico and Ciccone, Marco and Gatti, Nicola},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2638--2657},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/carminati22a/carminati22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/carminati22a.html},\n abstract = \t {", "pdf": "https://proceedings.mlr.press/v162/carminati22a/carminati22a.pdf", "supp": "", "pdf_size": 582151, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=367374553743071687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Torino; Politecnico di Milano", "aff_domain": "polimi.it; ; ; ", "email": "polimi.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/carminati22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Politecnico di Milano;Politecnico di Torino", "aff_unique_dep": ";", "aff_unique_url": "https://www.polimi.it;https://www.polito.it", "aff_unique_abbr": "Polimi;Polito", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "A Minimax Learning Approach to Off-Policy Evaluation in Confounded Partially Observable Markov Decision Processes", "status": "Oral", "track": 
"main", "site": "https://icml.cc/virtual/2022/poster/16359", "id": "16359", "proceeding": "https://proceedings.mlr.press/v162/shi22f.html", "poster": "", "slides": "", "author_site": "Chengchun Shi, Masatoshi Uehara, Jiawei Huang, Nan Jiang", "author": "Chengchun Shi; Masatoshi Uehara; Jiawei Huang; Nan Jiang", "abstract": "We consider off-policy evaluation (OPE) in Partially Observable Markov Decision Processes (POMDPs), where the evaluation policy depends only on observable variables and the behavior policy depends on unobservable latent variables. Existing works either assume no unmeasured confounders, or focus on settings where both the observation and the state spaces are tabular. In this work, we first propose novel identification methods for OPE in POMDPs with latent confounders, by introducing bridge functions that link the target policy\u2019s value and the observed data distribution. We next propose minimax estimation methods for learning these bridge functions, and construct three estimators based on these estimated bridge functions, corresponding to a value function-based estimator, a marginalized importance sampling estimator, and a doubly-robust estimator. Our proposal permits general function approximation and is thus applicable to settings with continuous or large observation/state spaces. The nonasymptotic and asymptotic properties of the proposed estimators are investigated in detail. A Python implementation of our proposal is available at https://github.com/jiaweihhuang/ Confounded-POMDP-Exp.", "bibtex": "@InProceedings{pmlr-v162-shi22f,\n title = \t {A Minimax Learning Approach to Off-Policy Evaluation in Confounded Partially Observable {M}arkov Decision Processes},\n author = {Shi, Chengchun and Uehara, Masatoshi and Huang, Jiawei and Jiang, Nan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20057--20094},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22f/shi22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22f.html},\n abstract = \t {We consider off-policy evaluation (OPE) in Partially Observable Markov Decision Processes (POMDPs), where the evaluation policy depends only on observable variables and the behavior policy depends on unobservable latent variables. Existing works either assume no unmeasured confounders, or focus on settings where both the observation and the state spaces are tabular. In this work, we first propose novel identification methods for OPE in POMDPs with latent confounders, by introducing bridge functions that link the target policy\u2019s value and the observed data distribution. We next propose minimax estimation methods for learning these bridge functions, and construct three estimators based on these estimated bridge functions, corresponding to a value function-based estimator, a marginalized importance sampling estimator, and a doubly-robust estimator. Our proposal permits general function approximation and is thus applicable to settings with continuous or large observation/state spaces. The nonasymptotic and asymptotic properties of the proposed estimators are investigated in detail. 
A Python implementation of our proposal is available at https://github.com/jiaweihhuang/Confounded-POMDP-Exp.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22f/shi22f.pdf", "supp": "", "pdf_size": 1061753, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14872376674032871803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "https://github.com/jiaweihhuang/Confounded-POMDP-Exp", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/shi22f.html" }, { "title": "A Model-Agnostic Randomized Learning Framework based on Random Hypothesis Subspace Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16855", "id": "16855", "proceeding": "https://proceedings.mlr.press/v162/cao22a.html", "poster": "/media/PosterPDFs/ICML%202022/731ae30af8750c2d28720ea3c1f8c2b1.png?t=1657723937.9897854", "slides": "", "author_site": "Yiting Cao, Chao Lan", "author": "Yiting Cao; Chao Lan", "abstract": "We propose a model-agnostic randomized learning framework based on Random Hypothesis Subspace Sampling (RHSS). Given any hypothesis class, it randomly samples $k$ hypotheses and learns a near-optimal model from their span by simply solving a linear least square problem in $O(n k^2)$ time, where $n$ is the number of training instances. On the theory side, we derive the performance guarantee of RHSS from a generic subspace approximation perspective, leveraging properties of metric entropy and random matrices. On the practical side, we apply the RHSS framework to learn kernel, network and tree based models. 
Experimental results show they converge efficiently as $k$ increases and outperform their model-specific counterparts including random Fourier feature, random vector functional link and extra tree on real-world data sets.}\n}", "pdf": "https://proceedings.mlr.press/v162/cao22a/cao22a.pdf", "supp": "", "pdf_size": 722187, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3398850635091285713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "School of Computer Science, University of Oklahoma, Norman, OK, USA; School of Computer Science, University of Oklahoma, Norman, OK, USA", "aff_domain": "ou.edu;ou.edu", "email": "ou.edu;ou.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/cao22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Oklahoma", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.ou.edu", "aff_unique_abbr": "OU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Norman", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Modern Self-Referential Weight Matrix That Learns to Modify Itself", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17699", "id": "17699", "proceeding": "https://proceedings.mlr.press/v162/irie22b.html", "poster": "/media/PosterPDFs/ICML%202022/5c971edc0c2cc92fc99b5a3609450cb7.png?t=1657790261.3850436", "slides": "", "author_site": "Kazuki Irie, Imanol Schlag, Robert Cordas, J\u00fcrgen Schmidhuber", "author": "Kazuki Irie; Imanol Schlag; R\u00f3bert Csord\u00e1s; J\u00fcrgen Schmidhuber", "abstract": "The weight matrix (WM) of a neural network (NN) is its program. The programs of many traditional NNs are learned through gradient descent in some error function, then remain fixed. The WM of a self-referential NN, however, can keep rapidly modifying all of itself during runtime. In principle, such NNs can meta-learn to learn, and meta-meta-learn to meta-learn to learn, and so on, in the sense of recursive self-improvement. While NN architectures potentially capable of implementing such behaviour have been proposed since the \u201990s, there have been few if any practical studies. Here we revisit such NNs, building upon recent successes of fast weight programmers and closely related linear Transformers. We propose a scalable self-referential WM (SRWM) that learns to use outer products and the delta update rule to modify itself. We evaluate our SRWM in supervised few-shot learning and in multi-task reinforcement learning with procedurally generated game environments. Our experiments demonstrate both practical applicability and competitive performance of the proposed SRWM. 
Our code is public.", "bibtex": "@InProceedings{pmlr-v162-irie22b,\n title = \t {A Modern Self-Referential Weight Matrix That Learns to Modify Itself},\n author = {Irie, Kazuki and Schlag, Imanol and Csord{\\'a}s, R{\\'o}bert and Schmidhuber, J{\\\"u}rgen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9660--9677},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/irie22b/irie22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/irie22b.html},\n abstract = \t {The weight matrix (WM) of a neural network (NN) is its program. The programs of many traditional NNs are learned through gradient descent in some error function, then remain fixed. The WM of a self-referential NN, however, can keep rapidly modifying all of itself during runtime. In principle, such NNs can meta-learn to learn, and meta-meta-learn to meta-learn to learn, and so on, in the sense of recursive self-improvement. While NN architectures potentially capable of implementing such behaviour have been proposed since the \u201990s, there have been few if any practical studies. Here we revisit such NNs, building upon recent successes of fast weight programmers and closely related linear Transformers. We propose a scalable self-referential WM (SRWM) that learns to use outer products and the delta update rule to modify itself. We evaluate our SRWM in supervised few-shot learning and in multi-task reinforcement learning with procedurally generated game environments. Our experiments demonstrate both practical applicability and competitive performance of the proposed SRWM. 
Our code is public.}\n}", "pdf": "https://proceedings.mlr.press/v162/irie22b/irie22b.pdf", "supp": "", "pdf_size": 1293230, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10630456414832460528&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia", "aff_domain": "idsia.ch;idsia.ch;idsia.ch;idsia.ch", "email": "idsia.ch;idsia.ch;idsia.ch;idsia.ch", "github": "https://github.com/IDSIA/modern-srwm", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/irie22b.html", "aff_unique_index": "0+1;0+1;0+1;0+1", "aff_unique_norm": "Swiss AI Lab IDSIA;King Abdullah University of Science and Technology", "aff_unique_dep": "AI Lab;AI Initiative", "aff_unique_url": "https://www.idsia.ch/;https://www.kaust.edu.sa", "aff_unique_abbr": "IDSIA;KAUST", "aff_campus_unique_index": "0+1;0+1;0+1;0+1", "aff_campus_unique": "Lugano;Thuwal", "aff_country_unique_index": "0+1;0+1;0+1;0+1", "aff_country_unique": "Switzerland;Saudi Arabia" }, { "title": "A Multi-objective / Multi-task Learning Framework Induced by Pareto Stationarity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17627", "id": "17627", "proceeding": "https://proceedings.mlr.press/v162/momma22a.html", "poster": "/media/PosterPDFs/ICML%202022/67ff32d40fb51f1a2fd2c4f1b1019785.png?t=1657613730.1484475", "slides": "", "author_site": "Michinari Momma, Chaosheng Dong, Jia Liu", "author": "Michinari Momma; Chaosheng Dong; Jia Liu", "abstract": "Multi-objective optimization (MOO) and multi-task learning (MTL) have gained much popularity with prevalent use cases such as production model development of regression / classification / ranking models with MOO, and training deep learning models with MTL. Despite the long history of research in MOO, its application to machine learning requires development of solution strategy, and algorithms have recently been developed to solve specific problems such as discovery of any Pareto optimal (PO) solution, and that with a particular form of preference. In this paper, we develop a novel and generic framework to discover a PO solution with multiple forms of preferences. It allows us to formulate a generic MOO / MTL problem to express a preference, which is solved to achieve both alignment with the preference and PO, at the same time. Specifically, we apply the framework to solve the weighted Chebyshev problem and an extension of that. The former is known as a method to discover the Pareto front, the latter helps to find a model that outperforms an existing model with only one run. 
Experimental results demonstrate not only the method achieves competitive performance with existing methods, but also it allows us to achieve the performance from different forms of preferences.", "bibtex": "@InProceedings{pmlr-v162-momma22a,\n title = \t {A Multi-objective / Multi-task Learning Framework Induced by Pareto Stationarity},\n author = {Momma, Michinari and Dong, Chaosheng and Liu, Jia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15895--15907},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/momma22a/momma22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/momma22a.html},\n abstract = \t {Multi-objective optimization (MOO) and multi-task learning (MTL) have gained much popularity with prevalent use cases such as production model development of regression / classification / ranking models with MOO, and training deep learning models with MTL. Despite the long history of research in MOO, its application to machine learning requires development of solution strategy, and algorithms have recently been developed to solve specific problems such as discovery of any Pareto optimal (PO) solution, and that with a particular form of preference. In this paper, we develop a novel and generic framework to discover a PO solution with multiple forms of preferences. It allows us to formulate a generic MOO / MTL problem to express a preference, which is solved to achieve both alignment with the preference and PO, at the same time. Specifically, we apply the framework to solve the weighted Chebyshev problem and an extension of that. The former is known as a method to discover the Pareto front, the latter helps to find a model that outperforms an existing model with only one run. 
Experimental results demonstrate not only the method achieves competitive performance with existing methods, but also it allows us to achieve the performance from different forms of preferences.}\n}", "pdf": "https://proceedings.mlr.press/v162/momma22a/momma22a.pdf", "supp": "", "pdf_size": 658623, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13234724422986576350&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Amazon.com Inc.; Amazon.com Inc.; The Ohio State University", "aff_domain": "amazon.com;amazon.com;ece.osu.edu", "email": "amazon.com;amazon.com;ece.osu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/momma22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Amazon;Ohio State University", "aff_unique_dep": "Amazon.com, Inc.;", "aff_unique_url": "https://www.amazon.com;https://www.osu.edu", "aff_unique_abbr": "Amazon;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Natural Actor-Critic Framework for Zero-Sum Markov Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16709", "id": "16709", "proceeding": "https://proceedings.mlr.press/v162/alacaoglu22a.html", "poster": "/media/PosterPDFs/ICML%202022/09fb05dd477d4ae6479985ca56c5a12d.png?t=1657978338.3256788", "slides": "", "author_site": "Ahmet Alacaoglu, Luca Viano, Niao He, Volkan Cevher", "author": "Ahmet Alacaoglu; Luca Viano; Niao He; Volkan Cevher", "abstract": "We introduce algorithms based on natural actor-critic and analyze their sample complexity for solving two player zero-sum Markov games in the tabular case. Our results improve the best-known sample complexities of policy gradient/actor-critic methods for convergence to Nash equilibrium in the multi-agent setting. We use the error propagation scheme in approximate dynamic programming, recent advances for global convergence of policy gradient methods, temporal difference learning, and techniques from stochastic primal-dual optimization. Our algorithms feature two stages, requiring agents to agree on an etiquette before starting their interactions, which is feasible for instance in self-play. However, the agents only access to joint reward and joint next state and not to each other\u2019s actions or policies. Our complexity results match the best-known results for global convergence of policy gradient algorithms for single agent RL. We provide numerical verification of our methods for a two player bandit environment and a two player game, Alesia. 
We observe improved empirical performance as compared to the recently proposed optimistic gradient descent-ascent variant for Markov games.", "bibtex": "@InProceedings{pmlr-v162-alacaoglu22a,\n title = \t {A Natural Actor-Critic Framework for Zero-Sum {M}arkov Games},\n author = {Alacaoglu, Ahmet and Viano, Luca and He, Niao and Cevher, Volkan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {307--366},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alacaoglu22a/alacaoglu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alacaoglu22a.html},\n abstract = \t {We introduce algorithms based on natural actor-critic and analyze their sample complexity for solving two player zero-sum Markov games in the tabular case. Our results improve the best-known sample complexities of policy gradient/actor-critic methods for convergence to Nash equilibrium in the multi-agent setting. We use the error propagation scheme in approximate dynamic programming, recent advances for global convergence of policy gradient methods, temporal difference learning, and techniques from stochastic primal-dual optimization. Our algorithms feature two stages, requiring agents to agree on an etiquette before starting their interactions, which is feasible for instance in self-play. However, the agents only access to joint reward and joint next state and not to each other\u2019s actions or policies. Our complexity results match the best-known results for global convergence of policy gradient algorithms for single agent RL. We provide numerical verification of our methods for a two player bandit environment and a two player game, Alesia. 
We observe improved empirical performance as compared to the recently proposed optimistic gradient descent-ascent variant for Markov games.}\n}", "pdf": "https://proceedings.mlr.press/v162/alacaoglu22a/alacaoglu22a.pdf", "supp": "", "pdf_size": 822160, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12917430839617025032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Wisconsin-Madison; EPFL; ETH-Z\u00fcrich; EPFL", "aff_domain": "wisc.edu; ; ; ", "email": "wisc.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/alacaoglu22a.html", "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Wisconsin-Madison;EPFL;ETH Zurich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.wisc.edu;https://www.epfl.ch;https://www.ethz.ch", "aff_unique_abbr": "UW-Madison;EPFL;ETHZ", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Madison;;Z\u00fcrich", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;Switzerland" }, { "title": "A Neural Tangent Kernel Perspective of GANs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15999", "id": "15999", "proceeding": "https://proceedings.mlr.press/v162/franceschi22a.html", "poster": "/media/PosterPDFs/ICML%202022/25766f01628f3d34b93a36a2301dffc9.png?t=1657198304.0730932", "slides": "/media/icml-2022/Slides/15999.pdf", "author_site": "Jean-Yves Franceschi, Emmanuel de B\u00e9zenac, Ibrahim Ayed, Mickael Chen, Sylvain Lamprier, Patrick Gallinari", "author": "Jean-Yves Franceschi; Emmanuel De B\u00e9zenac; Ibrahim Ayed; Mickael Chen; Sylvain Lamprier; Patrick Gallinari", "abstract": "We propose a novel theoretical framework of analysis for Generative Adversarial Networks (GANs). We reveal a fundamental flaw of previous analyses which, by incorrectly modeling GANs\u2019 training scheme, are subject to ill-defined discriminator gradients. We overcome this issue which impedes a principled study of GAN training, solving it within our framework by taking into account the discriminator\u2019s architecture. To this end, we leverage the theory of infinite-width neural networks for the discriminator via its Neural Tangent Kernel. We characterize the trained discriminator for a wide range of losses and establish general differentiability properties of the network. From this, we derive new insights about the convergence of the generated distribution, advancing our understanding of GANs\u2019 training dynamics. 
We empirically corroborate these results via an analysis toolkit based on our framework, unveiling intuitions that are consistent with GAN practice.", "bibtex": "@InProceedings{pmlr-v162-franceschi22a,\n title = \t {A Neural Tangent Kernel Perspective of {GAN}s},\n author = {Franceschi, Jean-Yves and De B{\\'e}zenac, Emmanuel and Ayed, Ibrahim and Chen, Mickael and Lamprier, Sylvain and Gallinari, Patrick},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6660--6704},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/franceschi22a/franceschi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/franceschi22a.html},\n abstract = \t {We propose a novel theoretical framework of analysis for Generative Adversarial Networks (GANs). We reveal a fundamental flaw of previous analyses which, by incorrectly modeling GANs\u2019 training scheme, are subject to ill-defined discriminator gradients. We overcome this issue which impedes a principled study of GAN training, solving it within our framework by taking into account the discriminator\u2019s architecture. To this end, we leverage the theory of infinite-width neural networks for the discriminator via its Neural Tangent Kernel. We characterize the trained discriminator for a wide range of losses and establish general differentiability properties of the network. From this, we derive new insights about the convergence of the generated distribution, advancing our understanding of GANs\u2019 training dynamics. We empirically corroborate these results via an analysis toolkit based on our framework, unveiling intuitions that are consistent with GAN practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/franceschi22a/franceschi22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/franceschi22a-supp.zip", "pdf_size": 3089213, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4606779800346786718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 19, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/franceschi22a.html" }, { "title": "A New Perspective on the Effects of Spectrum in Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16961", "id": "16961", "proceeding": "https://proceedings.mlr.press/v162/yang22n.html", "poster": "/media/PosterPDFs/ICML%202022/e22dd5dabde45eda5a1a67772c8e25dd.png?t=1657722904.991101", "slides": "", "author_site": "Mingqi Yang, Yanming Shen, Rui Li, Heng Qi, Qiang Zhang, Baocai Yin", "author": "Mingqi Yang; Yanming Shen; Rui Li; Heng Qi; Qiang Zhang; Baocai Yin", "abstract": "Many improvements on GNNs can be deemed as operations on the spectrum of the underlying graph matrix, which motivates us to directly study the characteristics of the spectrum and their effects on GNN performance. By generalizing most existing GNN architectures, we show that the correlation issue caused by the unsmooth spectrum becomes the obstacle to leveraging more powerful graph filters as well as developing deep architectures, which therefore restricts GNNs\u2019 performance. 
Inspired by this, we propose the correlation-free architecture which naturally removes the correlation issue among different channels, making it possible to utilize more sophisticated filters within each channel. The final correlation-free architecture with more powerful filters consistently boosts the performance of learning graph representations. Code is available at https://github.com/qslim/gnn-spectrum.", "bibtex": "@InProceedings{pmlr-v162-yang22n,\n title = \t {A New Perspective on the Effects of Spectrum in Graph Neural Networks},\n author = {Yang, Mingqi and Shen, Yanming and Li, Rui and Qi, Heng and Zhang, Qiang and Yin, Baocai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25261--25279},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22n/yang22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22n.html},\n abstract = \t {Many improvements on GNNs can be deemed as operations on the spectrum of the underlying graph matrix, which motivates us to directly study the characteristics of the spectrum and their effects on GNN performance. By generalizing most existing GNN architectures, we show that the correlation issue caused by the unsmooth spectrum becomes the obstacle to leveraging more powerful graph filters as well as developing deep architectures, which therefore restricts GNNs\u2019 performance. Inspired by this, we propose the correlation-free architecture which naturally removes the correlation issue among different channels, making it possible to utilize more sophisticated filters within each channel. The final correlation-free architecture with more powerful filters consistently boosts the performance of learning graph representations. 
Code is available at https://github.com/qslim/gnn-spectrum.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22n/yang22n.pdf", "supp": "", "pdf_size": 1295810, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12355104145181167707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Dalian University of Technology, China; Dalian University of Technology, China; Dalian University of Technology, China; Dalian University of Technology, China; Dalian University of Technology, China; Dalian University of Technology, China+Peng Cheng Laboratory, China", "aff_domain": "dlut.edu.cn; ; ; ; ; ", "email": "dlut.edu.cn; ; ; ; ; ", "github": "https://github.com/qslim/gnn-spectrum", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yang22n.html", "aff_unique_index": "0;0;0;0;0;0+1", "aff_unique_norm": "Dalian University of Technology;Pengcheng Laboratory", "aff_unique_dep": ";Peng Cheng Laboratory", "aff_unique_url": "http://www.dlut.edu.cn/;", "aff_unique_abbr": "DUT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "A Parametric Class of Approximate Gradient Updates for Policy Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17133", "id": "17133", "proceeding": "https://proceedings.mlr.press/v162/gummadi22a.html", "poster": "/media/PosterPDFs/ICML%202022/21e60123a3a0df92f391f66b1e51903a.png?t=1658170724.215024", "slides": "", "author_site": "Ramki Gummadi, Saurabh Kumar, Junfeng Wen, Dale Schuurmans", "author": "Ramki Gummadi; Saurabh Kumar; Junfeng Wen; Dale Schuurmans", "abstract": "Approaches to policy optimization have been motivated from diverse principles, based on how the parametric model is interpreted (e.g. value versus policy representation) or how the learning objective is formulated, yet they share a common goal of maximizing expected return. To better capture the commonalities and identify key differences between policy optimization methods, we develop a unified perspective that re-expresses the underlying updates in terms of a limited choice of gradient form and scaling function. In particular, we identify a parameterized space of approximate gradient updates for policy optimization that is highly structured, yet covers both classical and recent examples, including PPO. \tAs a result, we obtain novel yet well motivated updates that generalize existing algorithms in a way that can deliver benefits both in terms of convergence speed and final result quality. 
An experimental investigation demonstrates that the additional degrees of freedom provided in the parameterized family of updates can be leveraged to obtain non-trivial improvements both in synthetic domains and on popular deep RL benchmarks.", "bibtex": "@InProceedings{pmlr-v162-gummadi22a,\n title = \t {A Parametric Class of Approximate Gradient Updates for Policy Optimization},\n author = {Gummadi, Ramki and Kumar, Saurabh and Wen, Junfeng and Schuurmans, Dale},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7998--8015},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gummadi22a/gummadi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gummadi22a.html},\n abstract = \t {Approaches to policy optimization have been motivated from diverse principles, based on how the parametric model is interpreted (e.g. value versus policy representation) or how the learning objective is formulated, yet they share a common goal of maximizing expected return. To better capture the commonalities and identify key differences between policy optimization methods, we develop a unified perspective that re-expresses the underlying updates in terms of a limited choice of gradient form and scaling function. In particular, we identify a parameterized space of approximate gradient updates for policy optimization that is highly structured, yet covers both classical and recent examples, including PPO. \tAs a result, we obtain novel yet well motivated updates that generalize existing algorithms in a way that can deliver benefits both in terms of convergence speed and final result quality. 
An experimental investigation demonstrates that the additional degrees of freedom provided in the parameterized family of updates can be leveraged to obtain non-trivial improvements both in synthetic domains and on popular deep RL benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/gummadi22a/gummadi22a.pdf", "supp": "", "pdf_size": 1895812, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZrdAbIB_aiMJ:scholar.google.com/&scioq=A+Parametric+Class+of+Approximate+Gradient+Updates+for+Policy+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 7, "aff": "Google Research, Brain Team; Google Research, Brain Team + Stanford University; Layer 6 AI; Google Research, Brain Team + University of Alberta", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gummadi22a.html", "aff_unique_index": "0;0+1;2;0+3", "aff_unique_norm": "Google;Stanford University;Layer 6 AI;University of Alberta", "aff_unique_dep": "Google Research;;;", "aff_unique_url": "https://research.google;https://www.stanford.edu;https://layer6.ai;https://www.ualberta.ca", "aff_unique_abbr": "Google;Stanford;Layer 6 AI;UAlberta", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Mountain View;Stanford;", "aff_country_unique_index": "0;0+0;1;0+1", "aff_country_unique": "United States;Canada" }, { "title": "A Psychological Theory of Explainability", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16249", "id": "16249", "proceeding": "https://proceedings.mlr.press/v162/yang22c.html", "poster": "/media/PosterPDFs/ICML%202022/ce47be4abd80ac324c645fd57a27fc73.png?t=1657178572.5826976", "slides": "", "author_site": "Scott Cheng-Hsin Yang, Nils Erik Tomas Folke, Patrick Shafto", "author": "Scott Cheng-Hsin Yang; Nils Erik Tomas Folke; Patrick Shafto", "abstract": "The goal of explainable Artificial Intelligence (XAI) is to generate human-interpretable explanations, but there are no computationally precise theories of how humans interpret AI generated explanations. The lack of theory means that validation of XAI must be done empirically, on a case-by-case basis, which prevents systematic theory-building in XAI. We propose a psychological theory of how humans draw conclusions from saliency maps, the most common form of XAI explanation, which for the first time allows for precise prediction of explainee inference conditioned on explanation. Our theory posits that absent explanation humans expect the AI to make similar decisions to themselves, and that they interpret an explanation by comparison to the explanations they themselves would give. Comparison is formalized via Shepard\u2019s universal law of generalization in a similarity space, a classic theory from cognitive science. 
A pre-registered user study on AI image classifications with saliency map explanations demonstrates that our theory quantitatively matches participants\u2019 predictions of the AI.", "bibtex": "@InProceedings{pmlr-v162-yang22c,\n title = \t {A Psychological Theory of Explainability},\n author = {Yang, Scott Cheng-Hsin and Folke, Nils Erik Tomas and Shafto, Patrick},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25007--25021},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22c/yang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22c.html},\n abstract = \t {The goal of explainable Artificial Intelligence (XAI) is to generate human-interpretable explanations, but there are no computationally precise theories of how humans interpret AI generated explanations. The lack of theory means that validation of XAI must be done empirically, on a case-by-case basis, which prevents systematic theory-building in XAI. We propose a psychological theory of how humans draw conclusions from saliency maps, the most common form of XAI explanation, which for the first time allows for precise prediction of explainee inference conditioned on explanation. Our theory posits that absent explanation humans expect the AI to make similar decisions to themselves, and that they interpret an explanation by comparison to the explanations they themselves would give. Comparison is formalized via Shepard\u2019s universal law of generalization in a similarity space, a classic theory from cognitive science. 
A pre-registered user study on AI image classifications with saliency map explanations demonstrate that our theory quantitatively matches participants\u2019 predictions of the AI.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22c/yang22c.pdf", "supp": "", "pdf_size": 1967700, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12918730794535682130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Mathematics and Computer Science, Rutgers University\u2013Newark, New Jersey, USA+School of Mathematics, Institute for Advanced Study, New Jersey, USA; Department of Mathematics and Computer Science, Rutgers University\u2013Newark, New Jersey, USA+School of Mathematics, Institute for Advanced Study, New Jersey, USA; Department of Mathematics and Computer Science, Rutgers University\u2013Newark, New Jersey, USA", "aff_domain": "gmail.com;gmail.com; ", "email": "gmail.com;gmail.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yang22c.html", "aff_unique_index": "0+1;0+1;0", "aff_unique_norm": "Rutgers University\u2013Newark;Institute for Advanced Study", "aff_unique_dep": "Department of Mathematics and Computer Science;School of Mathematics", "aff_unique_url": "https://www.rutgers.edu;https://www.ias.edu", "aff_unique_abbr": "Rutgers;IAS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Newark;", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "A Random Matrix Analysis of Data Stream Clustering: Coping With Limited Memory Resources", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16367", "id": "16367", "proceeding": "https://proceedings.mlr.press/v162/lebeau22a.html", "poster": "/media/PosterPDFs/ICML%202022/d693d554e0ede0d75f7d2873b015f228.png?t=1657723835.6164029", "slides": "/media/icml-2022/Slides/16367.pdf", "author_site": "Hugo Lebeau, Romain Couillet, Florent Chatelain", "author": "Hugo Lebeau; Romain Couillet; Florent Chatelain", "abstract": "This article introduces a random matrix framework for the analysis of clustering on high-dimensional data streams, a particularly relevant setting for a more sober processing of large amounts of data with limited memory and energy resources. Assuming data $\\mathbf{x}_1, \\mathbf{x}_2, \\ldots$ arrives as a continuous flow and a small number $L$ of them can be kept in the learning pipeline, one has only access to the diagonal elements of the Gram kernel matrix: $\\left[ \\mathbf{K}_L \\right]_{i, j} = \\frac{1}{p} \\mathbf{x}_i^\\top \\mathbf{x}_j \\mathbf{1}_{\\left\\lvert i - j \\right\\rvert < L}$. Under a large-dimensional data regime, we derive the limiting spectral distribution of the banded kernel matrix $\\mathbf{K}_L$ and study its isolated eigenvalues and eigenvectors, which behave in an unfamiliar way. We detail how these results can be used to perform efficient online kernel spectral clustering and provide theoretical performance guarantees. Our findings are empirically confirmed on image clustering tasks. 
Leveraging on optimality results of spectral methods for clustering, this work offers insights on efficient online clustering techniques for high-dimensional data.", "bibtex": "@InProceedings{pmlr-v162-lebeau22a,\n title = \t {A Random Matrix Analysis of Data Stream Clustering: Coping With Limited Memory Resources},\n author = {Lebeau, Hugo and Couillet, Romain and Chatelain, Florent},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12253--12281},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lebeau22a/lebeau22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lebeau22a.html},\n abstract = \t {This article introduces a random matrix framework for the analysis of clustering on high-dimensional data streams, a particularly relevant setting for a more sober processing of large amounts of data with limited memory and energy resources. Assuming data $\\mathbf{x}_1, \\mathbf{x}_2, \\ldots$ arrives as a continuous flow and a small number $L$ of them can be kept in the learning pipeline, one has only access to the diagonal elements of the Gram kernel matrix: $\\left[ \\mathbf{K}_L \\right]_{i, j} = \\frac{1}{p} \\mathbf{x}_i^\\top \\mathbf{x}_j \\mathbf{1}_{\\left\\lvert i - j \\right\\rvert < L}$. Under a large-dimensional data regime, we derive the limiting spectral distribution of the banded kernel matrix $\\mathbf{K}_L$ and study its isolated eigenvalues and eigenvectors, which behave in an unfamiliar way. We detail how these results can be used to perform efficient online kernel spectral clustering and provide theoretical performance guarantees. Our findings are empirically confirmed on image clustering tasks. Leveraging on optimality results of spectral methods for clustering, this work offers insights on efficient online clustering techniques for high-dimensional data.}\n}", "pdf": "https://proceedings.mlr.press/v162/lebeau22a/lebeau22a.pdf", "supp": "", "pdf_size": 2454320, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6286728850096336619&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Univ. 
Grenoble Alpes, CNRS, Grenoble INP, GIPSA-lab, 38000 Grenoble, France", "aff_domain": "univ-grenoble-alpes.fr; ; ", "email": "univ-grenoble-alpes.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lebeau22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Universite Grenoble Alpes", "aff_unique_dep": "", "aff_unique_url": "https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "UGA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Grenoble", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "A Reduction from Linear Contextual Bandits Lower Bounds to Estimations Lower Bounds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17007", "id": "17007", "proceeding": "https://proceedings.mlr.press/v162/he22e.html", "poster": "/media/PosterPDFs/ICML%202022/497476fe61816251905e8baafdf54c23.png?t=1656173787.4591198", "slides": "", "author_site": "Jiahao He, Jiheng Zhang, Rachel Q. Zhang", "author": "Jiahao He; Jiheng Zhang; Rachel Q. Zhang", "abstract": "Linear contextual bandits and their variants are usually solved using algorithms guided by parameter estimation. Cauchy-Schwartz inequality established that estimation errors dominate algorithm regrets, and thus, accurate estimators suffice to guarantee algorithms with low regrets. In this paper, we complete the reverse direction by establishing the necessity. In particular, we provide a generic transformation from algorithms for linear contextual bandits to estimators for linear models, and show that algorithm regrets dominate estimation errors of their induced estimators, i.e., low-regret algorithms must imply accurate estimators. Moreover, our analysis reduces the regret lower bound to an estimation error, bridging the lower bound analysis in linear contextual bandit problems and linear regression.", "bibtex": "@InProceedings{pmlr-v162-he22e,\n title = \t {A Reduction from Linear Contextual Bandit Lower Bounds to Estimation Lower Bounds},\n author = {He, Jiahao and Zhang, Jiheng and Zhang, Rachel Q.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8660--8677},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22e/he22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22e.html},\n abstract = \t {Linear contextual bandits and their variants are usually solved using algorithms guided by parameter estimation. Cauchy-Schwartz inequality established that estimation errors dominate algorithm regrets, and thus, accurate estimators suffice to guarantee algorithms with low regrets. In this paper, we complete the reverse direction by establishing the necessity. In particular, we provide a generic transformation from algorithms for linear contextual bandits to estimators for linear models, and show that algorithm regrets dominate estimation errors of their induced estimators, i.e., low-regret algorithms must imply accurate estimators. 
Moreover, our analysis reduces the regret lower bound to an estimation error, bridging the lower bound analysis in linear contextual bandit problems and linear regression.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22e/he22e.pdf", "supp": "", "pdf_size": 487521, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17830744373865348774&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Industrial Engineering and Decision Analytics, The Hong Kong University of Science and Technology, Clear Water Bay, Kowloon, Hong Kong, China; Department of Industrial Engineering and Decision Analytics, The Hong Kong University of Science and Technology, Clear Water Bay, Kowloon, Hong Kong, China; Department of Industrial Engineering and Decision Analytics, The Hong Kong University of Science and Technology, Clear Water Bay, Kowloon, Hong Kong, China", "aff_domain": "connect.ust.hk; ; ", "email": "connect.ust.hk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/he22e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Industrial Engineering and Decision Analytics", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Clear Water Bay", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "A Regret Minimization Approach to Multi-Agent Control", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17035", "id": "17035", "proceeding": "https://proceedings.mlr.press/v162/ghai22a.html", "poster": "/media/PosterPDFs/ICML%202022/7ee6f2b3b68a212d3b7a4f6557eb8cc7.png?t=1657506333.1541424", "slides": "", "author_site": "Udaya Ghai, Udari Madhuhshani, Naomi Leonard, Elad Hazan", "author": "Udaya Ghai; Udari Madhushani; Naomi Leonard; Elad Hazan", "abstract": "We study the problem of multi-agent control of a dynamical system with known dynamics and adversarial disturbances. Our study focuses on optimal control without centralized precomputed policies, but rather with adaptive control policies for the different agents that are only equipped with a stabilizing controller. We give a reduction from any (standard) regret minimizing control method to a distributed algorithm. The reduction guarantees that the resulting distributed algorithm has low regret relative to the optimal precomputed joint policy. Our methodology involves generalizing online convex optimization to a multi-agent setting and applying recent tools from nonstochastic control derived for a single agent. We empirically evaluate our method on a model of an overactuated aircraft. 
We show that the distributed method is robust to failure and to adversarial perturbations in the dynamics.", "bibtex": "@InProceedings{pmlr-v162-ghai22a,\n title = \t {A Regret Minimization Approach to Multi-Agent Control},\n author = {Ghai, Udaya and Madhushani, Udari and Leonard, Naomi and Hazan, Elad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7422--7434},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghai22a/ghai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghai22a.html},\n abstract = \t {We study the problem of multi-agent control of a dynamical system with known dynamics and adversarial disturbances. Our study focuses on optimal control without centralized precomputed policies, but rather with adaptive control policies for the different agents that are only equipped with a stabilizing controller. We give a reduction from any (standard) regret minimizing control method to a distributed algorithm. The reduction guarantees that the resulting distributed algorithm has low regret relative to the optimal precomputed joint policy. Our methodology involves generalizing online convex optimization to a multi-agent setting and applying recent tools from nonstochastic control derived for a single agent. We empirically evaluate our method on a model of an overactuated aircraft. We show that the distributed method is robust to failure and to adversarial perturbations in the dynamics.}\n}", "pdf": "https://proceedings.mlr.press/v162/ghai22a/ghai22a.pdf", "supp": "", "pdf_size": 403608, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8734345102262559111&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Princeton University, Princeton, NJ + Google AI Princeton, Princeton, NJ; Department of Mechanical and Aerospace Engineering, Princeton University, Princeton, NJ; Department of Mechanical and Aerospace Engineering, Princeton University, Princeton, NJ; Department of Computer Science, Princeton University, Princeton, NJ + Google AI Princeton, Princeton, NJ", "aff_domain": "cs.princeton.edu; ; ; ", "email": "cs.princeton.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ghai22a.html", "aff_unique_index": "0+1;0;0;0+1", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": "Department of Computer Science;Google AI", "aff_unique_url": "https://www.princeton.edu;https://ai.google", "aff_unique_abbr": "Princeton;Google AI", "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Princeton", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "A Resilient Distributed Boosting Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15981", "id": "15981", "proceeding": "https://proceedings.mlr.press/v162/filmus22a.html", "poster": "", "slides": "", "author_site": "Yuval filmus, Idan Mehalel, Shay Moran", "author": "Yuval Filmus; Idan Mehalel; Shay Moran", "abstract": "Given a learning task where the data is distributed among several parties, communication is one of the fundamental resources which the parties would like to minimize. 
We present a distributed boosting algorithm which is resilient to a limited amount of noise. Our algorithm is similar to classical boosting algorithms, although it is equipped with a new component, inspired by Impagliazzo\u2019s hard-core lemma (Impagliazzo, 1995), adding a robustness quality to the algorithm. We also complement this result by showing that resilience to any asymptotically larger noise is not achievable by a communication-efficient algorithm.", "bibtex": "@InProceedings{pmlr-v162-filmus22a,\n title = \t {A Resilient Distributed Boosting Algorithm},\n author = {Filmus, Yuval and Mehalel, Idan and Moran, Shay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6465--6473},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/filmus22a/filmus22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/filmus22a.html},\n abstract = \t {Given a learning task where the data is distributed among several parties, communication is one of the fundamental resources which the parties would like to minimize. We present a distributed boosting algorithm which is resilient to a limited amount of noise. Our algorithm is similar to classical boosting algorithms, although it is equipped with a new component, inspired by Impagliazzo\u2019s hard-core lemma (Impagliazzo, 1995), adding a robustness quality to the algorithm. We also complement this result by showing that resilience to any asymptotically larger noise is not achievable by a communication-efficient algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/filmus22a/filmus22a.pdf", "supp": "", "pdf_size": 329342, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3676488938238523081&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "The Henry and Marilyn Taub Faculty of Computer Science, Technion, Haifa, Israel+Faculty of Mathematics, Technion, Haifa, Israel+Google Research, Israel; The Henry and Marilyn Taub Faculty of Computer Science, Technion, Haifa, Israel+Faculty of Mathematics, Technion, Haifa, Israel+Google Research, Israel; The Henry and Marilyn Taub Faculty of Computer Science, Technion, Haifa, Israel+Faculty of Mathematics, Technion, Haifa, Israel+Google Research, Israel", "aff_domain": "cs.technion.ac.il;cs.technion.ac.il;cs.technion.ac.il", "email": "cs.technion.ac.il;cs.technion.ac.il;cs.technion.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/filmus22a.html", "aff_unique_index": "0+0+1;0+0+1;0+0+1", "aff_unique_norm": "Technion;Google", "aff_unique_dep": "Faculty of Computer Science;Google Research", "aff_unique_url": "https://www.technion.ac.il;https://research.google", "aff_unique_abbr": "Technion;Google", "aff_campus_unique_index": "0+0+1;0+0+1;0+0+1", "aff_campus_unique": "Haifa;Israel", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", "aff_country_unique": "Israel" }, { "title": "A Rigorous Study of Integrated Gradients Method and Extensions to Internal Neuron Attributions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17261", "id": "17261", "proceeding": "https://proceedings.mlr.press/v162/lundstrom22a.html", "poster": "", "slides": "", "author_site": "Daniel Lundstrom, 
Tianjian Huang, Meisam Razaviyayn", "author": "Daniel D Lundstrom; Tianjian Huang; Meisam Razaviyayn", "abstract": "As deep learning (DL) efficacy grows, concerns for poor model explainability grow also. Attribution methods address the issue of explainability by quantifying the importance of an input feature for a model prediction. Among various methods, Integrated Gradients (IG) sets itself apart by claiming other methods failed to satisfy desirable axioms, while IG and methods like it uniquely satisfy said axioms. This paper comments on fundamental aspects of IG and its applications/extensions: 1) We identify key differences between IG function spaces and the supporting literature\u2019s function spaces which problematize previous claims of IG uniqueness. We show that with the introduction of an additional axiom, non-decreasing positivity, the uniqueness claims can be established. 2) We address the question of input sensitivity by identifying function classes where IG is/is not Lipschitz in the attributed input. 3) We show that axioms for single-baseline methods have analogous properties for methods with probability distribution baselines. 4) We introduce a computationally efficient method of identifying internal neurons that contribute to specified regions of an IG attribution map. Finally, we present experimental results validating this method.", "bibtex": "@InProceedings{pmlr-v162-lundstrom22a,\n title = \t {A Rigorous Study of Integrated Gradients Method and Extensions to Internal Neuron Attributions},\n author = {Lundstrom, Daniel D and Huang, Tianjian and Razaviyayn, Meisam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14485--14508},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lundstrom22a/lundstrom22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lundstrom22a.html},\n abstract = \t {As deep learning (DL) efficacy grows, concerns for poor model explainability grow also. Attribution methods address the issue of explainability by quantifying the importance of an input feature for a model prediction. Among various methods, Integrated Gradients (IG) sets itself apart by claiming other methods failed to satisfy desirable axioms, while IG and methods like it uniquely satisfy said axioms. This paper comments on fundamental aspects of IG and its applications/extensions: 1) We identify key differences between IG function spaces and the supporting literature\u2019s function spaces which problematize previous claims of IG uniqueness. We show that with the introduction of an additional axiom, non-decreasing positivity, the uniqueness claims can be established. 2) We address the question of input sensitivity by identifying function classes where IG is/is not Lipschitz in the attributed input. 3) We show that axioms for single-baseline methods have analogous properties for methods with probability distribution baselines. 4) We introduce a computationally efficient method of identifying internal neurons that contribute to specified regions of an IG attribution map. 
Finally, we present experimental results validating this method.}\n}", "pdf": "https://proceedings.mlr.press/v162/lundstrom22a/lundstrom22a.pdf", "supp": "", "pdf_size": 1520614, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2734810007243082678&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Southern California; University of Southern California; University of Southern California", "aff_domain": "usc.edu; ; ", "email": "usc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lundstrom22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Self-Play Posterior Sampling Algorithm for Zero-Sum Markov Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16781", "id": "16781", "proceeding": "https://proceedings.mlr.press/v162/xiong22b.html", "poster": "/media/PosterPDFs/ICML%202022/d16509f6eaca1022bd8f28d6bc582cae.png?t=1658190579.8064237", "slides": "/media/icml-2022/Slides/16781.pdf", "author_site": "Wei Xiong, Han Zhong, Chengshuai Shi, Cong Shen, Tong Zhang", "author": "Wei Xiong; Han Zhong; Chengshuai Shi; Cong Shen; Tong Zhang", "abstract": "Existing studies on provably efficient algorithms for Markov games (MGs) almost exclusively build on the \u201coptimism in the face of uncertainty\u201d (OFU) principle. This work focuses on a distinct approach of posterior sampling, which is celebrated in many bandits and reinforcement learning settings but remains under-explored for MGs. Specifically, for episodic two-player zero-sum MGs, a novel posterior sampling algorithm is developed with", "bibtex": "@InProceedings{pmlr-v162-xiong22b,\n title = \t {A Self-Play Posterior Sampling Algorithm for Zero-Sum {M}arkov Games},\n author = {Xiong, Wei and Zhong, Han and Shi, Chengshuai and Shen, Cong and Zhang, Tong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24496--24523},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xiong22b/xiong22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/xiong22b.html},\n abstract = \t {Existing studies on provably efficient algorithms for Markov games (MGs) almost exclusively build on the \u201coptimism in the face of uncertainty\u201d (OFU) principle. This work focuses on a distinct approach of posterior sampling, which is celebrated in many bandits and reinforcement learning settings but remains under-explored for MGs. 
Specifically, for episodic two-player zero-sum MGs, a novel posterior sampling algorithm is developed with", "pdf": "https://proceedings.mlr.press/v162/xiong22b/xiong22b.pdf", "supp": "", "pdf_size": 458992, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11738362655884072094&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "The Hong Kong University of Science and Technology; Center for Data Science, Peking University; University of Virginia; University of Virginia; The Hong Kong University of Science and Technology + Google Research", "aff_domain": "ust.hk;pku.edu.cn;virginia.edu;virginia.edu;tongzhang-ml.org", "email": "ust.hk;pku.edu.cn;virginia.edu;virginia.edu;tongzhang-ml.org", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xiong22b.html", "aff_unique_index": "0;1;2;2;0+3", "aff_unique_norm": "Hong Kong University of Science and Technology;Peking University;University of Virginia;Google", "aff_unique_dep": ";Center for Data Science;;Google Research", "aff_unique_url": "https://www.ust.hk;http://www.pku.edu.cn;https://www.virginia.edu;https://research.google", "aff_unique_abbr": "HKUST;PKU;UVA;Google Research", "aff_campus_unique_index": "0;1;0+3", "aff_campus_unique": "Hong Kong SAR;Beijing;;Mountain View", "aff_country_unique_index": "0;0;1;1;0+1", "aff_country_unique": "China;United States" }, { "title": "A Simple Guard for Learned Optimizers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17027", "id": "17027", "proceeding": "https://proceedings.mlr.press/v162/premont-schwarz22a.html", "poster": "/media/PosterPDFs/ICML%202022/a4d41b834ea903526373a9a1ae2ac66e.png?t=1657832711.2952354", "slides": "/media/icml-2022/Slides/17027.pdf", "author_site": "Isabeau Pr\u00e9mont-Schwarz, Jaroslav V\u00edtk\u016f, Jan Feyereisl", "author": "Isabeau Pr\u00e9mont-Schwarz; Jaroslav V\u0131\u0301tk\u016f; Jan Feyereisl", "abstract": "If the trend of learned components eventually outperforming their hand-crafted version continues, learned optimizers will eventually outperform hand-crafted optimizers like SGD or Adam. Even if learned optimizers (L2Os) eventually outpace hand-crafted ones in practice however, they are still not provably convergent and might fail out of distribution. These are the questions addressed here. Currently, learned optimizers frequently outperform generic hand-crafted optimizers (such as gradient descent) at the beginning of learning but they generally plateau after some time while the generic algorithms continue to make progress and often overtake the learned algorithm as Aesop\u2019s tortoise which overtakes the hare. L2Os also still have a difficult time generalizing out of distribution. \\cite{heaton_safeguarded_2020} proposed Safeguarded L2O (GL2O) which can take a learned optimizer and safeguard it with a generic learning algorithm so that by conditionally switching between the two, the resulting algorithm is provably convergent. We propose a new class of Safeguarded L2O, called Loss-Guarded L2O (LGL2O), which is both conceptually simpler and computationally less expensive. The guarding mechanism decides solely based on the expected future loss value of both optimizers. 
Furthermore, we show theoretical proof of LGL2O\u2019s convergence guarantee and empirical results comparing to GL2O and other baselines showing that it combines the best of both L2O and SGD and that in practice converges much better than GL2O.", "bibtex": "@InProceedings{pmlr-v162-premont-schwarz22a,\n title = \t {A Simple Guard for Learned Optimizers},\n author = {Pr{\\'e}mont-Schwarz, Isabeau and V\\'{\\i}tk{\\r{u}}, Jaroslav and Feyereisl, Jan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17910--17925},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/premont-schwarz22a/premont-schwarz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/premont-schwarz22a.html},\n abstract = \t {If the trend of learned components eventually outperforming their hand-crafted version continues, learned optimizers will eventually outperform hand-crafted optimizers like SGD or Adam. Even if learned optimizers (L2Os) eventually outpace hand-crafted ones in practice however, they are still not provably convergent and might fail out of distribution. These are the questions addressed here. Currently, learned optimizers frequently outperform generic hand-crafted optimizers (such as gradient descent) at the beginning of learning but they generally plateau after some time while the generic algorithms continue to make progress and often overtake the learned algorithm as Aesop\u2019s tortoise which overtakes the hare. L2Os also still have a difficult time generalizing out of distribution. \\cite{heaton_safeguarded_2020} proposed Safeguarded L2O (GL2O) which can take a learned optimizer and safeguard it with a generic learning algorithm so that by conditionally switching between the two, the resulting algorithm is provably convergent. We propose a new class of Safeguarded L2O, called Loss-Guarded L2O (LGL2O), which is both conceptually simpler and computationally less expensive. The guarding mechanism decides solely based on the expected future loss value of both optimizers. 
Furthermore, we show theoretical proof of LGL2O\u2019s convergence guarantee and empirical results comparing to GL2O and other baselines showing that it combines the best of both L2O and SGD and that in practice converges much better than GL2O.}\n}", "pdf": "https://proceedings.mlr.press/v162/premont-schwarz22a/premont-schwarz22a.pdf", "supp": "", "pdf_size": 10973941, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10064627005172671510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Good AI, Prague, Czechia; Good AI, Prague, Czechia; Good AI, Prague, Czechia", "aff_domain": "goodai.com;goodai.com; ", "email": "goodai.com;goodai.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/premont-schwarz22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Good AI", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Czechia" }, { "title": "A Simple Reward-free Approach to Constrained Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16775", "id": "16775", "proceeding": "https://proceedings.mlr.press/v162/miryoosefi22a.html", "poster": "/media/PosterPDFs/ICML%202022/2adafb1b5d684e6c15a2d063367be012.png?t=1657766837.9070892", "slides": "", "author_site": "Sobhan Miryoosefi, Chi Jin", "author": "Sobhan Miryoosefi; Chi Jin", "abstract": "In constrained reinforcement learning (RL), a learning agent seeks to not only optimize the overall reward but also satisfy the additional safety, diversity, or budget constraints. Consequently, existing constrained RL solutions require several new algorithmic ingredients that are notably different from standard RL. On the other hand, reward-free RL is independently developed in the unconstrained literature, which learns the transition dynamics without using the reward information, and thus naturally capable of addressing RL with multiple objectives under the common dynamics. This paper bridges reward-free RL and constrained RL. Particularly, we propose a simple meta-algorithm such that given any reward-free RL oracle, the approachability and constrained RL problems can be directly solved with negligible overheads in sample complexity. 
Utilizing the existing reward-free RL solvers, our framework provides sharp sample complexity results for constrained RL in the tabular MDP setting, matching the best existing results up to a factor of horizon dependence; our framework directly extends to a setting of tabular two-player Markov games, and gives a new result for constrained RL with linear function approximation.", "bibtex": "@InProceedings{pmlr-v162-miryoosefi22a,\n title = \t {A Simple Reward-free Approach to Constrained Reinforcement Learning},\n author = {Miryoosefi, Sobhan and Jin, Chi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15666--15698},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/miryoosefi22a/miryoosefi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/miryoosefi22a.html},\n abstract = \t {In constrained reinforcement learning (RL), a learning agent seeks to not only optimize the overall reward but also satisfy the additional safety, diversity, or budget constraints. Consequently, existing constrained RL solutions require several new algorithmic ingredients that are notably different from standard RL. On the other hand, reward-free RL is independently developed in the unconstrained literature, which learns the transition dynamics without using the reward information, and thus naturally capable of addressing RL with multiple objectives under the common dynamics. This paper bridges reward-free RL and constrained RL. Particularly, we propose a simple meta-algorithm such that given any reward-free RL oracle, the approachability and constrained RL problems can be directly solved with negligible overheads in sample complexity. 
Utilizing the existing reward-free RL solvers, our framework provides sharp sample complexity results for constrained RL in the tabular MDP setting, matching the best existing results up to a factor of horizon dependence; our framework directly extends to a setting of tabular two-player Markov games, and gives a new result for constrained RL with linear function approximation.}\n}", "pdf": "https://proceedings.mlr.press/v162/miryoosefi22a/miryoosefi22a.pdf", "supp": "", "pdf_size": 491084, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13936124106083293459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Princeton University; Princeton University", "aff_domain": "cs.princeton.edu; ", "email": "cs.princeton.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/miryoosefi22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Simple Unified Framework for High Dimensional Bandit Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16793", "id": "16793", "proceeding": "https://proceedings.mlr.press/v162/li22a.html", "poster": "/media/PosterPDFs/ICML%202022/12b1e42dc0746f22cf361267de07073f.png?t=1657167700.2415485", "slides": "", "author_site": "Wenjie Li, Adarsh Barik, Jean Honorio", "author": "Wenjie Li; Adarsh Barik; Jean Honorio", "abstract": "Stochastic high dimensional bandit problems with low dimensional structures are useful in different applications such as online advertising and drug discovery. In this work, we propose a simple unified algorithm for such problems and present a general analysis framework for the regret upper bound of our algorithm. We show that under some mild unified assumptions, our algorithm can be applied to different high-dimensional bandit problems. Our framework utilizes the low dimensional structure to guide the parameter estimation in the problem, therefore our algorithm achieves the comparable regret bounds in the LASSO bandit as a sanity check, as well as novel bounds that depend logarithmically on dimensions in the low-rank matrix bandit, the group sparse matrix bandit, and in a new problem: the multi-agent LASSO bandit.", "bibtex": "@InProceedings{pmlr-v162-li22a,\n title = \t {A Simple Unified Framework for High Dimensional Bandit Problems},\n author = {Li, Wenjie and Barik, Adarsh and Honorio, Jean},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12619--12655},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22a/li22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22a.html},\n abstract = \t {Stochastic high dimensional bandit problems with low dimensional structures are useful in different applications such as online advertising and drug discovery. In this work, we propose a simple unified algorithm for such problems and present a general analysis framework for the regret upper bound of our algorithm. 
We show that under some mild unified assumptions, our algorithm can be applied to different high-dimensional bandit problems. Our framework utilizes the low dimensional structure to guide the parameter estimation in the problem, therefore our algorithm achieves the comparable regret bounds in the LASSO bandit as a sanity check, as well as novel bounds that depend logarithmically on dimensions in the low-rank matrix bandit, the group sparse matrix bandit, and in a new problem: the multi-agent LASSO bandit.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22a/li22a.pdf", "supp": "", "pdf_size": 906307, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3777279565746719431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, Purdue University; Department of Computer Science, Purdue University; Department of Computer Science, Purdue University", "aff_domain": "purdue.edu; ;purdue.edu", "email": "purdue.edu; ;purdue.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Simple yet Universal Strategy for Online Convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17431", "id": "17431", "proceeding": "https://proceedings.mlr.press/v162/zhang22af.html", "poster": "/media/PosterPDFs/ICML%202022/21be992eb8016e541a15953eee90760e_OwFUJEh.png?t=1658120913.7945597", "slides": "", "author_site": "Lijun Zhang, Guanghui Wang, Jinfeng Yi, Tianbao Yang", "author": "Lijun Zhang; Guanghui Wang; Jinfeng Yi; Tianbao Yang", "abstract": "Recently, several universal methods have been proposed for online convex optimization, and attain minimax rates for multiple types of convex functions simultaneously. However, they need to design and optimize one surrogate loss for each type of functions, making it difficult to exploit the structure of the problem and utilize existing algorithms. In this paper, we propose a simple strategy for universal online convex optimization, which avoids these limitations. The key idea is to construct a set of experts to process the original online functions, and deploy a meta-algorithm over the linearized losses to aggregate predictions from experts. Specifically, the meta-algorithm is required to yield a second-order bound with excess losses, so that it can leverage strong convexity and exponential concavity to control the meta-regret. In this way, our strategy inherits the theoretical guarantee of any expert designed for strongly convex functions and exponentially concave functions, up to a double logarithmic factor. As a result, we can plug in off-the-shelf online solvers as black-box experts to deliver problem-dependent regret bounds. 
For general convex functions, it maintains the minimax optimality and also achieves a small-loss bound.", "bibtex": "@InProceedings{pmlr-v162-zhang22af,\n title = \t {A Simple yet Universal Strategy for Online Convex Optimization},\n author = {Zhang, Lijun and Wang, Guanghui and Yi, Jinfeng and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26605--26623},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22af/zhang22af.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22af.html},\n abstract = \t {Recently, several universal methods have been proposed for online convex optimization, and attain minimax rates for multiple types of convex functions simultaneously. However, they need to design and optimize one surrogate loss for each type of functions, making it difficult to exploit the structure of the problem and utilize existing algorithms. In this paper, we propose a simple strategy for universal online convex optimization, which avoids these limitations. The key idea is to construct a set of experts to process the original online functions, and deploy a meta-algorithm over the linearized losses to aggregate predictions from experts. Specifically, the meta-algorithm is required to yield a second-order bound with excess losses, so that it can leverage strong convexity and exponential concavity to control the meta-regret. In this way, our strategy inherits the theoretical guarantee of any expert designed for strongly convex functions and exponentially concave functions, up to a double logarithmic factor. As a result, we can plug in off-the-shelf online solvers as black-box experts to deliver problem-dependent regret bounds. 
For general convex functions, it maintains the minimax optimality and also achieves a small-loss bound.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22af/zhang22af.pdf", "supp": "", "pdf_size": 673930, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5241984188870982900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China+Peng Cheng Laboratory, Shenzhen, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; Frontis.AI, Beijing, China; Department of Computer Science, The University of Iowa, Iowa City, USA", "aff_domain": "lamda.nju.edu.cn; ; ; ", "email": "lamda.nju.edu.cn; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22af.html", "aff_unique_index": "0+1;0;2;3", "aff_unique_norm": "Nanjing University;Pengcheng Laboratory;Frontis.AI;University of Iowa", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;Peng Cheng Laboratory;;Department of Computer Science", "aff_unique_url": "http://www.nju.edu.cn;;;https://www.uiowa.edu", "aff_unique_abbr": "Nanjing U;;;UIowa", "aff_campus_unique_index": "0+1;0;2;3", "aff_campus_unique": "Nanjing;Shenzhen;Beijing;Iowa City", "aff_country_unique_index": "0+0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "A Single-Loop Gradient Descent and Perturbed Ascent Algorithm for Nonconvex Functional Constrained Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18161", "id": "18161", "proceeding": "https://proceedings.mlr.press/v162/lu22a.html", "poster": "/media/PosterPDFs/ICML%202022/7a0c1035bcb33b6d86942ecbedb37267.png?t=1657617447.3371758", "slides": "", "author": "Songtao Lu", "abstract": "Nonconvex constrained optimization problems can be used to model a number of machine learning problems, such as multi-class Neyman-Pearson classification and constrained Markov decision processes. However, such kinds of problems are challenging because both the objective and constraints are possibly nonconvex, so it is difficult to balance the reduction of the loss value and reduction of constraint violation. Although there are a few methods that solve this class of problems, all of them are double-loop or triple-loop algorithms, and they require oracles to solve some subproblems up to certain accuracy by tuning multiple hyperparameters at each iteration. In this paper, we propose a novel gradient descent and perturbed ascent (GDPA) algorithm to solve a class of smooth nonconvex inequality constrained problems. The GDPA is a primal-dual algorithm, which only exploits the first-order information of both the objective and constraint functions to update the primal and dual variables in an alternating way. The key feature of the proposed algorithm is that it is a single-loop algorithm, where only two step-sizes need to be tuned. We show that under a mild regularity condition GDPA is able to find Karush-Kuhn-Tucker (KKT) points of nonconvex functional constrained problems with convergence rate guarantees. To the best of our knowledge, it is the first single-loop algorithm that can solve the general nonconvex smooth problems with nonconvex inequality constraints. 
Numerical results also showcase the superiority of GDPA compared with the best-known algorithms (in terms of both stationarity measure and feasibility of the obtained solutions).", "bibtex": "@InProceedings{pmlr-v162-lu22a,\n title = \t {A Single-Loop Gradient Descent and Perturbed Ascent Algorithm for Nonconvex Functional Constrained Optimization},\n author = {Lu, Songtao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14315--14357},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22a/lu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22a.html},\n abstract = \t {Nonconvex constrained optimization problems can be used to model a number of machine learning problems, such as multi-class Neyman-Pearson classification and constrained Markov decision processes. However, such kinds of problems are challenging because both the objective and constraints are possibly nonconvex, so it is difficult to balance the reduction of the loss value and reduction of constraint violation. Although there are a few methods that solve this class of problems, all of them are double-loop or triple-loop algorithms, and they require oracles to solve some subproblems up to certain accuracy by tuning multiple hyperparameters at each iteration. In this paper, we propose a novel gradient descent and perturbed ascent (GDPA) algorithm to solve a class of smooth nonconvex inequality constrained problems. The GDPA is a primal-dual algorithm, which only exploits the first-order information of both the objective and constraint functions to update the primal and dual variables in an alternating way. The key feature of the proposed algorithm is that it is a single-loop algorithm, where only two step-sizes need to be tuned. We show that under a mild regularity condition GDPA is able to find Karush-Kuhn-Tucker (KKT) points of nonconvex functional constrained problems with convergence rate guarantees. To the best of our knowledge, it is the first single-loop algorithm that can solve the general nonconvex smooth problems with nonconvex inequality constraints. Numerical results also showcase the superiority of GDPA compared with the best-known algorithms (in terms of both stationarity measure and feasibility of the obtained solutions).}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22a/lu22a.pdf", "supp": "", "pdf_size": 1117964, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12887981666859591732&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "IBM Research AI, IBM Thomas J. 
Watson Research Center, Yorktown Heights, New York 10598, USA", "aff_domain": "ibm.com", "email": "ibm.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/lu22a.html", "aff_unique_index": "0", "aff_unique_norm": "IBM", "aff_unique_dep": "IBM Research AI", "aff_unique_url": "https://www.ibm.com/research", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "0", "aff_campus_unique": "Yorktown Heights", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "A State-Distribution Matching Approach to Non-Episodic Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18113", "id": "18113", "proceeding": "https://proceedings.mlr.press/v162/sharma22a.html", "poster": "/media/PosterPDFs/ICML%202022/41f860e3b7f548abc1f8b812059137bf.png?t=1657972238.131413", "slides": "", "author_site": "Archit Sharma, Rehaan Ahmad, Chelsea Finn", "author": "Archit Sharma; Rehaan Ahmad; Chelsea Finn", "abstract": "While reinforcement learning (RL) provides a framework for learning through trial and error, translating RL algorithms into the real world has remained challenging. A major hurdle to real-world application arises from the development of algorithms in an episodic setting where the environment is reset after every trial, in contrast with the continual and non-episodic nature of the real-world encountered by embodied agents such as humans and robots. Enabling agents to learn behaviors autonomously in such non-episodic environments requires that the agent to be able to conduct its own trials. Prior works have considered an alternating approach where a forward policy learns to solve the task and the backward policy learns to reset the environment, but what initial state distribution should the backward policy reset the agent to? Assuming access to a few demonstrations, we propose a new method, MEDAL, that trains the backward policy to match the state distribution in the provided demonstrations. This keeps the agent close to the task-relevant states, allowing for a mix of easy and difficult starting states for the forward policy. Our experiments show that MEDAL matches or outperforms prior methods on three sparse-reward continuous control tasks from the EARL benchmark, with 40% gains on the hardest task, while making fewer assumptions than prior works.", "bibtex": "@InProceedings{pmlr-v162-sharma22a,\n title = \t {A State-Distribution Matching Approach to Non-Episodic Reinforcement Learning},\n author = {Sharma, Archit and Ahmad, Rehaan and Finn, Chelsea},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19645--19657},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sharma22a/sharma22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sharma22a.html},\n abstract = \t {While reinforcement learning (RL) provides a framework for learning through trial and error, translating RL algorithms into the real world has remained challenging. 
A major hurdle to real-world application arises from the development of algorithms in an episodic setting where the environment is reset after every trial, in contrast with the continual and non-episodic nature of the real-world encountered by embodied agents such as humans and robots. Enabling agents to learn behaviors autonomously in such non-episodic environments requires that the agent to be able to conduct its own trials. Prior works have considered an alternating approach where a forward policy learns to solve the task and the backward policy learns to reset the environment, but what initial state distribution should the backward policy reset the agent to? Assuming access to a few demonstrations, we propose a new method, MEDAL, that trains the backward policy to match the state distribution in the provided demonstrations. This keeps the agent close to the task-relevant states, allowing for a mix of easy and difficult starting states for the forward policy. Our experiments show that MEDAL matches or outperforms prior methods on three sparse-reward continuous control tasks from the EARL benchmark, with 40% gains on the hardest task, while making fewer assumptions than prior works.}\n}", "pdf": "https://proceedings.mlr.press/v162/sharma22a/sharma22a.pdf", "supp": "", "pdf_size": 10182237, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14448955307324292158&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Stanford University; Stanford University; Stanford University", "aff_domain": "stanford.edu;stanford.edu; ", "email": "stanford.edu;stanford.edu; ", "github": "", "project": "https://sites.google.com/view/medal-arl/home", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sharma22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "A Statistical Manifold Framework for Point Cloud Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17931", "id": "17931", "proceeding": "https://proceedings.mlr.press/v162/lee22d.html", "poster": "/media/PosterPDFs/ICML%202022/11958dfee29b6709f48a9ba0387a2431.png?t=1657176040.4169993", "slides": "", "author_site": "Yonghyeon Lee, Seungyeon Kim, Jinwon Choi, Frank Chongwoo Park", "author": "Yonghyeon Lee; Seungyeon Kim; Jinwon Choi; Frank Park", "abstract": "Many problems in machine learning involve data sets in which each data point is a point cloud in $\\mathbb{R}^D$. A growing number of applications require a means of measuring not only distances between point clouds, but also angles, volumes, derivatives, and other more advanced concepts. To formulate and quantify these concepts in a coordinate-invariant way, we develop a Riemannian geometric framework for point cloud data. By interpreting each point in a point cloud as a sample drawn from some given underlying probability density, the space of point cloud data can be given the structure of a statistical manifold \u2013 each point on this manifold represents a point cloud \u2013 with the Fisher information metric acting as a natural Riemannian metric. 
Two autoencoder applications of our framework are presented: (i) smoothly deforming one 3D object into another via interpolation between the two corresponding point clouds; (ii) learning an optimal set of latent space coordinates for point cloud data that best preserves angles and distances, and thus produces a more discriminative representation space. Experiments with large-scale standard benchmark point cloud data show greatly improved classification accuracy vis-\u00e1-vis existing methods. Code is available at https://github.com/seungyeon-k/SMF-public.", "bibtex": "@InProceedings{pmlr-v162-lee22d,\n title = \t {A Statistical Manifold Framework for Point Cloud Data},\n author = {Lee, Yonghyeon and Kim, Seungyeon and Choi, Jinwon and Park, Frank},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12378--12402},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22d/lee22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22d.html},\n abstract = \t {Many problems in machine learning involve data sets in which each data point is a point cloud in $\\mathbb{R}^D$. A growing number of applications require a means of measuring not only distances between point clouds, but also angles, volumes, derivatives, and other more advanced concepts. To formulate and quantify these concepts in a coordinate-invariant way, we develop a Riemannian geometric framework for point cloud data. By interpreting each point in a point cloud as a sample drawn from some given underlying probability density, the space of point cloud data can be given the structure of a statistical manifold \u2013 each point on this manifold represents a point cloud \u2013 with the Fisher information metric acting as a natural Riemannian metric. Two autoencoder applications of our framework are presented: (i) smoothly deforming one 3D object into another via interpolation between the two corresponding point clouds; (ii) learning an optimal set of latent space coordinates for point cloud data that best preserves angles and distances, and thus produces a more discriminative representation space. Experiments with large-scale standard benchmark point cloud data show greatly improved classification accuracy vis-\u00e1-vis existing methods. 
Code is available at https://github.com/seungyeon-k/SMF-public.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22d/lee22d.pdf", "supp": "", "pdf_size": 4655457, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10640933730705692613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Mechanical Engineering, Seoul National University, Seoul, South Korea; Department of Mechanical Engineering, Seoul National University, Seoul, South Korea; Kakao Enterprise, Seongnam, Kyonggi-do, South Korea; Saige Research, Seoul, South Korea + Department of Mechanical Engineering, Seoul National University, Seoul, South Korea", "aff_domain": "snu.ac.kr; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ;snu.ac.kr", "github": "https://github.com/seungyeon-k/SMF-public", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lee22d.html", "aff_unique_index": "0;0;1;2+0", "aff_unique_norm": "Seoul National University;Kakao Enterprise;Saige Research", "aff_unique_dep": "Department of Mechanical Engineering;;", "aff_unique_url": "https://www.snu.ac.kr;https://www.kakaoenterprise.com;", "aff_unique_abbr": "SNU;Kakao Enterprise;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "A Stochastic Multi-Rate Control Framework For Modeling Distributed Optimization Algorithms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16923", "id": "16923", "proceeding": "https://proceedings.mlr.press/v162/zhang22j.html", "poster": "/media/PosterPDFs/ICML%202022/384babc3e7faa44cf1ca671b74499c3b.png?t=1657735546.9814076", "slides": "", "author_site": "xinwei zhang, Mingyi Hong, Sairaj Dhople, Nicola Elia", "author": "Xinwei Zhang; Mingyi Hong; Sairaj Dhople; Nicola Elia", "abstract": "In modern machine learning systems, distributed algorithms are deployed across applications to ensure data privacy and optimal utilization of computational resources. This work offers a fresh perspective to model, analyze, and design distributed optimization algorithms through the lens of stochastic multi-rate feedback control. We show that a substantial class of distributed algorithms\u2014including popular Gradient Tracking for decentralized learning, and FedPD and Scaffold for federated learning\u2014can be modeled as a certain discrete-time stochastic feedback-control system, possibly with multiple sampling rates. This key observation allows us to develop a generic framework to analyze the convergence of the entire algorithm class. 
It also enables one to easily add desirable features such as differential privacy guarantees, or to deal with practical settings such as partial agent participation, communication compression, and imperfect communication in algorithm design and analysis.", "bibtex": "@InProceedings{pmlr-v162-zhang22j,\n title = \t {A Stochastic Multi-Rate Control Framework For Modeling Distributed Optimization Algorithms},\n author = {Zhang, Xinwei and Hong, Mingyi and Dhople, Sairaj and Elia, Nicola},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26206--26222},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22j/zhang22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22j.html},\n abstract = \t {In modern machine learning systems, distributed algorithms are deployed across applications to ensure data privacy and optimal utilization of computational resources. This work offers a fresh perspective to model, analyze, and design distributed optimization algorithms through the lens of stochastic multi-rate feedback control. We show that a substantial class of distributed algorithms\u2014including popular Gradient Tracking for decentralized learning, and FedPD and Scaffold for federated learning\u2014can be modeled as a certain discrete-time stochastic feedback-control system, possibly with multiple sampling rates. This key observation allows us to develop a generic framework to analyze the convergence of the entire algorithm class. 
It also enables one to easily add desirable features such as differential privacy guarantees, or to deal with practical settings such as partial agent participation, communication compression, and imperfect communication in algorithm design and analysis.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22j/zhang22j.pdf", "supp": "", "pdf_size": 612741, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5650658128391863575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electric and Computer Engineering, Minnesota University, MN, United States; Department of Electric and Computer Engineering, Minnesota University, MN, United States; Department of Electric and Computer Engineering, Minnesota University, MN, United States; Department of Electric and Computer Engineering, Minnesota University, MN, United States", "aff_domain": "umn.edu;umn.edu; ; ", "email": "umn.edu;umn.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22j.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Minnesota", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.umn.edu", "aff_unique_abbr": "UMN", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Minneapolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Study of Face Obfuscation in ImageNet", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16447", "id": "16447", "proceeding": "https://proceedings.mlr.press/v162/yang22q.html", "poster": "/media/PosterPDFs/ICML%202022/46922a0880a8f11f8f69cbb52b1396be.png?t=1657988967.5195725", "slides": "", "author_site": "Kaiyu Yang, Jacqueline Yau, Li Fei-Fei, Jia Deng, Olga Russakovsky", "author": "Kaiyu Yang; Jacqueline H. Yau; Li Fei-Fei; Jia Deng; Olga Russakovsky", "abstract": "Face obfuscation (blurring, mosaicing, etc.) has been shown to be effective for privacy protection; nevertheless, object recognition research typically assumes access to complete, unobfuscated images. In this paper, we explore the effects of face obfuscation on the popular ImageNet challenge visual recognition benchmark. Most categories in the ImageNet challenge are not people categories; however, many incidental people appear in the images, and their privacy is a concern. We first annotate faces in the dataset. Then we demonstrate that face obfuscation has minimal impact on the accuracy of recognition models. Concretely, we benchmark multiple deep neural networks on obfuscated images and observe that the overall recognition accuracy drops only slightly (<= 1.0%). Further, we experiment with transfer learning to 4 downstream tasks (object recognition, scene recognition, face attribute classification, and object detection) and show that features learned on obfuscated images are equally transferable. Our work demonstrates the feasibility of privacy-aware visual recognition, improves the highly-used ImageNet challenge benchmark, and suggests an important path for future visual datasets. Data and code are available at https://github.com/princetonvisualai/imagenet-face-obfuscation.", "bibtex": "@InProceedings{pmlr-v162-yang22q,\n title = \t {A Study of Face Obfuscation in {I}mage{N}et},\n author = {Yang, Kaiyu and Yau, Jacqueline H. 
and Fei-Fei, Li and Deng, Jia and Russakovsky, Olga},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25313--25330},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22q/yang22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22q.html},\n abstract = \t {Face obfuscation (blurring, mosaicing, etc.) has been shown to be effective for privacy protection; nevertheless, object recognition research typically assumes access to complete, unobfuscated images. In this paper, we explore the effects of face obfuscation on the popular ImageNet challenge visual recognition benchmark. Most categories in the ImageNet challenge are not people categories; however, many incidental people appear in the images, and their privacy is a concern. We first annotate faces in the dataset. Then we demonstrate that face obfuscation has minimal impact on the accuracy of recognition models. Concretely, we benchmark multiple deep neural networks on obfuscated images and observe that the overall recognition accuracy drops only slightly (<= 1.0%). Further, we experiment with transfer learning to 4 downstream tasks (object recognition, scene recognition, face attribute classification, and object detection) and show that features learned on obfuscated images are equally transferable. Our work demonstrates the feasibility of privacy-aware visual recognition, improves the highly-used ImageNet challenge benchmark, and suggests an important path for future visual datasets. 
Data and code are available at https://github.com/princetonvisualai/imagenet-face-obfuscation.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22q/yang22q.pdf", "supp": "", "pdf_size": 4292584, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18170664845630332563&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Princeton University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Princeton University; Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu; ; ;cs.princeton.edu; ", "email": "cs.princeton.edu; ; ;cs.princeton.edu; ", "github": "https://github.com/princetonvisualai/imagenet-face-obfuscation", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yang22q.html", "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "Princeton University;Stanford University", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.princeton.edu;https://www.stanford.edu", "aff_unique_abbr": "Princeton;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Study on the Ramanujan Graph Property of Winning Lottery Tickets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16785", "id": "16785", "proceeding": "https://proceedings.mlr.press/v162/pal22a.html", "poster": "/media/PosterPDFs/ICML%202022/618faa1728eb2ef6e3733645273ab145_WTrBe3r.png?t=1657783431.5461788", "slides": "", "author_site": "Bithika Pal, Arindam Biswas, Sudeshna Kolay, Pabitra Mitra, Biswajit Basu", "author": "Bithika Pal; Arindam Biswas; Sudeshna Kolay; Pabitra Mitra; Biswajit Basu", "abstract": "Winning lottery tickets refer to sparse subgraphs of deep neural networks which have classification accuracy close to the original dense networks. Resilient connectivity properties of such sparse networks play an important role in their performance. The attempt is to identify a sparse and yet well-connected network to guarantee unhindered information flow. Connectivity in a graph is best characterized by its spectral expansion property. Ramanujan graphs are robust expanders which lead to sparse but highly-connected networks, and thus aid in studying the winning tickets. A feedforward neural network consists of a sequence of bipartite graphs representing its layers. We analyze the Ramanujan graph property of such bipartite layers in terms of their spectral characteristics using the Cheeger\u2019s inequality for irregular graphs. It is empirically observed that the winning ticket networks preserve the Ramanujan graph property and achieve a high accuracy even when the layers are sparse. Accuracy and robustness to noise start declining as many of the layers lose the property. Next we find a robust winning lottery ticket by pruning individual layers while retaining their respective Ramanujan graph property. 
This strategy is observed to improve the performance of existing network pruning algorithms.", "bibtex": "@InProceedings{pmlr-v162-pal22a,\n title = \t {A Study on the Ramanujan Graph Property of Winning Lottery Tickets},\n author = {Pal, Bithika and Biswas, Arindam and Kolay, Sudeshna and Mitra, Pabitra and Basu, Biswajit},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17186--17201},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pal22a/pal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pal22a.html},\n abstract = \t {Winning lottery tickets refer to sparse subgraphs of deep neural networks which have classification accuracy close to the original dense networks. Resilient connectivity properties of such sparse networks play an important role in their performance. The attempt is to identify a sparse and yet well-connected network to guarantee unhindered information flow. Connectivity in a graph is best characterized by its spectral expansion property. Ramanujan graphs are robust expanders which lead to sparse but highly-connected networks, and thus aid in studying the winning tickets. A feedforward neural network consists of a sequence of bipartite graphs representing its layers. We analyze the Ramanujan graph property of such bipartite layers in terms of their spectral characteristics using the Cheeger\u2019s inequality for irregular graphs. It is empirically observed that the winning ticket networks preserve the Ramanujan graph property and achieve a high accuracy even when the layers are sparse. Accuracy and robustness to noise start declining as many of the layers lose the property. Next we find a robust winning lottery ticket by pruning individual layers while retaining their respective Ramanujan graph property. 
This strategy is observed to improve the performance of existing network pruning algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/pal22a/pal22a.pdf", "supp": "", "pdf_size": 4763630, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12215591614929088872&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, India; Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, India; Department of Computer Science and Engineering, Indian Institute of Technology, Kharagpur, India; School of Civil, Structural and Environmental Engineering, Trinity College, Dublin, Ireland", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/pal22a.html", "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Indian Institute of Technology Kharagpur;University of Copenhagen;Trinity College Dublin", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Mathematical Sciences;School of Civil, Structural and Environmental Engineering", "aff_unique_url": "https://www.iitkgp.ac.in;https://www.ku.dk;https://www.tcd.ie", "aff_unique_abbr": "IIT Kharagpur;UCPH;TCD", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Kharagpur;;Dublin", "aff_country_unique_index": "0;1;0;0;2", "aff_country_unique": "India;Denmark;Ireland" }, { "title": "A Temporal-Difference Approach to Policy Gradient Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17449", "id": "17449", "proceeding": "https://proceedings.mlr.press/v162/tosatto22a.html", "poster": "/media/PosterPDFs/ICML%202022/f516dfb84b9051ed85b89cdc3a8ab7f5.png?t=1657213518.1625643", "slides": "", "author_site": "Samuele Tosatto, Andrew Patterson, Martha White, A. Mahmood", "author": "Samuele Tosatto; Andrew Patterson; Martha White; Rupam Mahmood", "abstract": "The policy gradient theorem (Sutton et al., 2000) prescribes the usage of a cumulative discounted state distribution under the target policy to approximate the gradient. Most algorithms based on this theorem, in practice, break this assumption, introducing a distribution shift that can cause the convergence to poor solutions. In this paper, we propose a new approach of reconstructing the policy gradient from the start state without requiring a particular sampling strategy. The policy gradient calculation in this form can be simplified in terms of a gradient critic, which can be recursively estimated due to a new Bellman equation of gradients. By using temporal-difference updates of the gradient critic from an off-policy data stream, we develop the first estimator that side-steps the distribution shift issue in a model-free way. We prove that, under certain realizability conditions, our estimator is unbiased regardless of the sampling strategy. 
We empirically show that our technique achieves a superior bias-variance trade-off and performance in presence of off-policy samples.", "bibtex": "@InProceedings{pmlr-v162-tosatto22a,\n title = \t {A Temporal-Difference Approach to Policy Gradient Estimation},\n author = {Tosatto, Samuele and Patterson, Andrew and White, Martha and Mahmood, Rupam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21609--21632},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tosatto22a/tosatto22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tosatto22a.html},\n abstract = \t {The policy gradient theorem (Sutton et al., 2000) prescribes the usage of a cumulative discounted state distribution under the target policy to approximate the gradient. Most algorithms based on this theorem, in practice, break this assumption, introducing a distribution shift that can cause the convergence to poor solutions. In this paper, we propose a new approach of reconstructing the policy gradient from the start state without requiring a particular sampling strategy. The policy gradient calculation in this form can be simplified in terms of a gradient critic, which can be recursively estimated due to a new Bellman equation of gradients. By using temporal-difference updates of the gradient critic from an off-policy data stream, we develop the first estimator that side-steps the distribution shift issue in a model-free way. We prove that, under certain realizability conditions, our estimator is unbiased regardless of the sampling strategy. 
We empirically show that our technique achieves a superior bias-variance trade-off and performance in presence of off-policy samples.}\n}", "pdf": "https://proceedings.mlr.press/v162/tosatto22a/tosatto22a.pdf", "supp": "", "pdf_size": 997916, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12213929390329707477&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Alberta, Edmonton, Canada+ CIFAR AI Chair, Alberta Machine Intelligence Institute (Amii); Department of Computer Science, University of Alberta, Edmonton, Canada+ CIFAR AI Chair, Alberta Machine Intelligence Institute (Amii); Department of Computer Science, University of Alberta, Edmonton, Canada+ CIFAR AI Chair, Alberta Machine Intelligence Institute (Amii); Department of Computer Science, University of Alberta, Edmonton, Canada+ CIFAR AI Chair, Alberta Machine Intelligence Institute (Amii)", "aff_domain": "ualberta.ca; ; ; ", "email": "ualberta.ca; ; ; ", "github": "https://github.com/SamuelePolimi/temporal-difference-gradient", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/tosatto22a.html", "aff_unique_index": "0+1;0+1;0+1;0+1", "aff_unique_norm": "University of Alberta;Alberta Machine Intelligence Institute", "aff_unique_dep": "Department of Computer Science;AI Chair", "aff_unique_url": "https://www.ualberta.ca;https://www.amii.ca", "aff_unique_abbr": "UAlberta;Amii", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Edmonton;", "aff_country_unique_index": "0+0;0+0;0+0;0+0", "aff_country_unique": "Canada" }, { "title": "A Theoretical Analysis on Independence-driven Importance Weighting for Covariate-shift Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17631", "id": "17631", "proceeding": "https://proceedings.mlr.press/v162/xu22o.html", "poster": "/media/PosterPDFs/ICML%202022/a49e9411d64ff53eccfdd09ad10a15b3_i22HD2k.png?t=1657712232.0337641", "slides": "", "author_site": "Renzhe Xu, Xingxuan Zhang, Zheyan Shen, Tong Zhang, Peng Cui", "author": "Renzhe Xu; Xingxuan Zhang; Zheyan Shen; Tong Zhang; Peng Cui", "abstract": "Covariate-shift generalization, a typical case in out-of-distribution (OOD) generalization, requires a good performance on the unknown test distribution, which varies from the accessible training distribution in the form of covariate shift. Recently, independence-driven importance weighting algorithms in stable learning literature have shown empirical effectiveness to deal with covariate-shift generalization on several learning models, including regression algorithms and deep neural networks, while their theoretical analyses are missing. In this paper, we theoretically prove the effectiveness of such algorithms by explaining them as feature selection processes. We first specify a set of variables, named minimal stable variable set, that is the minimal and optimal set of variables to deal with covariate-shift generalization for common loss functions, such as the mean squared loss and binary cross-entropy loss. Afterward, we prove that under ideal conditions, independence-driven importance weighting algorithms could identify the variables in this set. Analysis of asymptotic properties is also provided. 
These theories are further validated in several synthetic experiments.", "bibtex": "@InProceedings{pmlr-v162-xu22o,\n title = \t {A Theoretical Analysis on Independence-driven Importance Weighting for Covariate-shift Generalization},\n author = {Xu, Renzhe and Zhang, Xingxuan and Shen, Zheyan and Zhang, Tong and Cui, Peng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24803--24829},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22o/xu22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22o.html},\n abstract = \t {Covariate-shift generalization, a typical case in out-of-distribution (OOD) generalization, requires a good performance on the unknown test distribution, which varies from the accessible training distribution in the form of covariate shift. Recently, independence-driven importance weighting algorithms in stable learning literature have shown empirical effectiveness to deal with covariate-shift generalization on several learning models, including regression algorithms and deep neural networks, while their theoretical analyses are missing. In this paper, we theoretically prove the effectiveness of such algorithms by explaining them as feature selection processes. We first specify a set of variables, named minimal stable variable set, that is the minimal and optimal set of variables to deal with covariate-shift generalization for common loss functions, such as the mean squared loss and binary cross-entropy loss. Afterward, we prove that under ideal conditions, independence-driven importance weighting algorithms could identify the variables in this set. Analysis of asymptotic properties is also provided. 
These theories are further validated in several synthetic experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22o/xu22o.pdf", "supp": "", "pdf_size": 3153819, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14134137266916397351&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China; Computer Science & Mathematics, The Hong Kong University of Science and Technology, Hong Kong, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China", "aff_domain": "gmail.com;hotmail.com;mails.tsinghua.edu.cn;tongzhang-ml.org;tsinghua.edu.cn", "email": "gmail.com;hotmail.com;mails.tsinghua.edu.cn;tongzhang-ml.org;tsinghua.edu.cn", "github": "https://github.com/windxrz/independence-driven-IW", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xu22o.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Tsinghua University;Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science and Technology;Computer Science & Mathematics", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ust.hk", "aff_unique_abbr": "THU;HKUST", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Beijing;Hong Kong", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "A Theoretical Comparison of Graph Neural Network Extensions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16097", "id": "16097", "proceeding": "https://proceedings.mlr.press/v162/papp22a.html", "poster": "/media/PosterPDFs/ICML%202022/84e2d85ac232c681a641da1ec663888c.png?t=1657580732.0707457", "slides": "", "author_site": "P\u00e1l Andr\u00e1s Papp, Roger Wattenhofer", "author": "P\u00e1l Andr\u00e1s Papp; Roger Wattenhofer", "abstract": "We study and compare different Graph Neural Network extensions that increase the expressive power of GNNs beyond the Weisfeiler-Leman test. We focus on (i) GNNs based on higher order WL methods, (ii) GNNs that preprocess small substructures in the graph, (iii) GNNs that preprocess the graph up to a small radius, and (iv) GNNs that slightly perturb the graph to compute an embedding. We begin by presenting a simple improvement for this last extension that strictly increases the expressive power of this GNN variant. Then, as our main result, we compare the expressiveness of these extensions to each other through a series of example constructions that can be distinguished by one of the extensions, but not by another one. 
We also show negative examples that are particularly challenging for each of the extensions, and we prove several claims about the ability of these extensions to count cliques and cycles in the graph.", "bibtex": "@InProceedings{pmlr-v162-papp22a,\n title = \t {A Theoretical Comparison of Graph Neural Network Extensions},\n author = {Papp, P{\\'a}l Andr{\\'a}s and Wattenhofer, Roger},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17323--17345},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/papp22a/papp22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/papp22a.html},\n abstract = \t {We study and compare different Graph Neural Network extensions that increase the expressive power of GNNs beyond the Weisfeiler-Leman test. We focus on (i) GNNs based on higher order WL methods, (ii) GNNs that preprocess small substructures in the graph, (iii) GNNs that preprocess the graph up to a small radius, and (iv) GNNs that slightly perturb the graph to compute an embedding. We begin by presenting a simple improvement for this last extension that strictly increases the expressive power of this GNN variant. Then, as our main result, we compare the expressiveness of these extensions to each other through a series of example constructions that can be distinguished by one of the extensions, but not by another one. We also show negative examples that are particularly challenging for each of the extensions, and we prove several claims about the ability of these extensions to count cliques and cycles in the graph.}\n}", "pdf": "https://proceedings.mlr.press/v162/papp22a/papp22a.pdf", "supp": "", "pdf_size": 434820, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5399097345661932222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Distributed Computing Group, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; Distributed Computing Group, ETH Z\u00fcrich, Z\u00fcrich, Switzerland", "aff_domain": "ethz.ch;ethz.ch", "email": "ethz.ch;ethz.ch", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/papp22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Distributed Computing Group", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Z\u00fcrich", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "A Tighter Analysis of Spectral Clustering, and Beyond", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17963", "id": "17963", "proceeding": "https://proceedings.mlr.press/v162/macgregor22a.html", "poster": "", "slides": "", "author_site": "Peter Macgregor, He Sun", "author": "Peter Macgregor; He Sun", "abstract": "This work studies the classical spectral clustering algorithm which embeds the vertices of some graph G=(V_G, E_G) into R^k using k eigenvectors of some matrix of G, and applies k-means to partition V_G into k clusters. Our first result is a tighter analysis on the performance of spectral clustering, and explains why it works under some much weaker condition than the ones studied in the literature.
For the second result, we show that, by applying fewer than k eigenvectors to construct the embedding, spectral clustering is able to produce better output for many practical instances; this result is the first of its kind in spectral clustering. Besides its conceptual and theoretical significance, the practical impact of our work is demonstrated by the empirical analysis on both synthetic and real-world data sets, in which spectral clustering produces comparable or better results with fewer than k eigenvectors.", "bibtex": "@InProceedings{pmlr-v162-macgregor22a,\n title = \t {A Tighter Analysis of Spectral Clustering, and Beyond},\n author = {Macgregor, Peter and Sun, He},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14717--14742},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/macgregor22a/macgregor22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/macgregor22a.html},\n abstract = \t {This work studies the classical spectral clustering algorithm which embeds the vertices of some graph G=(V_G, E_G) into R^k using k eigenvectors of some matrix of G, and applies k-means to partition V_G into k clusters. Our first result is a tighter analysis on the performance of spectral clustering, and explains why it works under some much weaker condition than the ones studied in the literature. For the second result, we show that, by applying fewer than k eigenvectors to construct the embedding, spectral clustering is able to produce better output for many practical instances; this result is the first of its kind in spectral clustering. 
Besides its conceptual and theoretical significance, the practical impact of our work is demonstrated by the empirical analysis on both synthetic and real-world data sets, in which spectral clustering produces comparable or better results with fewer than k eigenvectors.}\n}", "pdf": "https://proceedings.mlr.press/v162/macgregor22a/macgregor22a.pdf", "supp": "", "pdf_size": 2870372, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7116468291147711017&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Informatics, University of Edinburgh, Edinburgh, United Kingdom; School of Informatics, University of Edinburgh, Edinburgh, United Kingdom", "aff_domain": "ed.ac.uk;ed.ac.uk", "email": "ed.ac.uk;ed.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/macgregor22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "School of Informatics", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "A Tree-based Model Averaging Approach for Personalized Treatment Effect Estimation from Heterogeneous Data Sources", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16611", "id": "16611", "proceeding": "https://proceedings.mlr.press/v162/tan22a.html", "poster": "/media/PosterPDFs/ICML%202022/d77f00766fd3be3f2189c843a6af3fb2_oU1czDd.png?t=1657162616.2346144", "slides": "/media/icml-2022/Slides/16611.pdf", "author_site": "Xiaoqing (Ellen) Tan, Chung-Chou H. Chang, Ling Zhou, Lu Tang", "author": "Xiaoqing Tan; Chung-Chou H. Chang; Ling Zhou; Lu Tang", "abstract": "Accurately estimating personalized treatment effects within a study site (e.g., a hospital) has been challenging due to limited sample size. Furthermore, privacy considerations and lack of resources prevent a site from leveraging subject-level data from other sites. We propose a tree-based model averaging approach to improve the estimation accuracy of conditional average treatment effects (CATE) at a target site by leveraging models derived from other potentially heterogeneous sites, without them sharing subject-level data. To our best knowledge, there is no established model averaging approach for distributed data with a focus on improving the estimation of treatment effects. Specifically, under distributed data networks, our framework provides an interpretable tree-based ensemble of CATE estimators that joins models across study sites, while actively modeling the heterogeneity in data sources through site partitioning. The performance of this approach is demonstrated by a real-world study of the causal effects of oxygen therapy on hospital survival rate and backed up by comprehensive simulation results.", "bibtex": "@InProceedings{pmlr-v162-tan22a,\n title = \t {A Tree-based Model Averaging Approach for Personalized Treatment Effect Estimation from Heterogeneous Data Sources},\n author = {Tan, Xiaoqing and Chang, Chung-Chou H. 
and Zhou, Ling and Tang, Lu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21013--21036},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tan22a/tan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tan22a.html},\n abstract = \t {Accurately estimating personalized treatment effects within a study site (e.g., a hospital) has been challenging due to limited sample size. Furthermore, privacy considerations and lack of resources prevent a site from leveraging subject-level data from other sites. We propose a tree-based model averaging approach to improve the estimation accuracy of conditional average treatment effects (CATE) at a target site by leveraging models derived from other potentially heterogeneous sites, without them sharing subject-level data. To our best knowledge, there is no established model averaging approach for distributed data with a focus on improving the estimation of treatment effects. Specifically, under distributed data networks, our framework provides an interpretable tree-based ensemble of CATE estimators that joins models across study sites, while actively modeling the heterogeneity in data sources through site partitioning. The performance of this approach is demonstrated by a real-world study of the causal effects of oxygen therapy on hospital survival rate and backed up by comprehensive simulation results.}\n}", "pdf": "https://proceedings.mlr.press/v162/tan22a/tan22a.pdf", "supp": "", "pdf_size": 2175797, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=602189476639254582&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of Pittsburgh, Pittsburgh, PA, USA; University of Pittsburgh, Pittsburgh, PA, USA; Southwestern University of Finance and Economics, Chengdu, China; University of Pittsburgh, Pittsburgh, PA, USA", "aff_domain": "pitt.edu;pitt.edu;swufe.edu.cn;pitt.edu", "email": "pitt.edu;pitt.edu;swufe.edu.cn;pitt.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/tan22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Pittsburgh;Southwestern University of Finance and Economics", "aff_unique_dep": ";", "aff_unique_url": "https://www.pitt.edu;https://www.swufe.edu.cn", "aff_unique_abbr": "Pitt;SWUFE", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Pittsburgh;Chengdu", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "A Unified View on PAC-Bayes Bounds for Meta-Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17497", "id": "17497", "proceeding": "https://proceedings.mlr.press/v162/rezazadeh22a.html", "poster": "/media/PosterPDFs/ICML%202022/d5a28f81834b6df2b6db6d3e5e2635c7_3nddO3S.png?t=1656586306.1003478", "slides": "/media/icml-2022/Slides/17497.pdf", "author": "Arezou Rezazadeh", "abstract": "Meta learning automatically infers an inductive bias, which includes the hyperparameter of the base-learning algorithm, by observing data from a finite number of related tasks. This paper studies PAC-Bayes bounds on the meta-generalization gap.
The meta-generalization gap comprises two sources of generalization gaps: the environment-level and task-level gaps resulting from observation of a finite number of tasks and data samples per task, respectively. In this paper, by upper bounding arbitrary convex functions, which link the expected and empirical losses at the environment and also per-task levels, we obtain new PAC-Bayes bounds. Using these bounds, we develop new PAC-Bayes meta-learning algorithms. Numerical examples demonstrate the merits of the proposed novel bounds and algorithm in comparison to prior PAC-Bayes bounds for meta-learning", "bibtex": "@InProceedings{pmlr-v162-rezazadeh22a,\n title = \t {A Unified View on {PAC}-{B}ayes Bounds for Meta-Learning},\n author = {Rezazadeh, Arezou},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18576--18595},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rezazadeh22a/rezazadeh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rezazadeh22a.html},\n abstract = \t {Meta learning automatically infers an inductive bias, which includes the hyperparameter of the base-learning algorithm, by observing data from a finite number of related tasks. This paper studies PAC-Bayes bounds on the meta-generalization gap. The meta-generalization gap comprises two sources of generalization gaps: the environment-level and task-level gaps resulting from observation of a finite number of tasks and data samples per task, respectively. In this paper, by upper bounding arbitrary convex functions, which link the expected and empirical losses at the environment and also per-task levels, we obtain new PAC-Bayes bounds. Using these bounds, we develop new PAC-Bayes meta-learning algorithms.
Numerical examples demonstrate the merits of the proposed novel bounds and algorithm in comparison to prior PAC-Bayes bounds for meta-learning}\n}", "pdf": "https://proceedings.mlr.press/v162/rezazadeh22a/rezazadeh22a.pdf", "supp": "", "pdf_size": 414217, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3133766927310411857&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical Engineering, Chalmers University of Technology, Gothenburg, Sweden", "aff_domain": "chalmers.se", "email": "chalmers.se", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/rezazadeh22a.html", "aff_unique_index": "0", "aff_unique_norm": "Chalmers University of Technology", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.chalmers.se", "aff_unique_abbr": "Chalmers", "aff_campus_unique_index": "0", "aff_campus_unique": "Gothenburg", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "title": "A Unified Weight Initialization Paradigm for Tensorial Convolutional Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18057", "id": "18057", "proceeding": "https://proceedings.mlr.press/v162/pan22b.html", "poster": "", "slides": "", "author_site": "Yu Pan, Zeyong Su, Ao Liu, Jingquan Wang, Nannan Li, ZENGLIN Xu", "author": "Yu Pan; Zeyong Su; Ao Liu; Wang Jingquan; Nannan Li; Zenglin Xu", "abstract": "Tensorial Convolutional Neural Networks (TCNNs) have attracted much research attention for their power in reducing model parameters or enhancing the generalization ability. However, exploration of TCNNs is hindered even from weight initialization methods. To be specific, general initialization methods, such as Xavier or Kaiming initialization, usually fail to generate appropriate weights for TCNNs. Meanwhile, although there are ad-hoc approaches for specific architectures (e.g., Tensor Ring Nets), they are not applicable to TCNNs with other tensor decomposition methods (e.g., CP or Tucker decomposition). To address this problem, we propose a universal weight initialization paradigm, which generalizes Xavier and Kaiming methods and can be widely applicable to arbitrary TCNNs. Specifically, we first present the Reproducing Transformation to convert the backward process in TCNNs to an equivalent convolution process. Then, based on the convolution operators in the forward and backward processes, we build a unified paradigm to control the variance of features and gradients in TCNNs. Thus, we can derive fan-in and fan-out initialization for various TCNNs. 
We demonstrate that our paradigm can stabilize the training of TCNNs, leading to faster convergence and better results.", "bibtex": "@InProceedings{pmlr-v162-pan22b,\n title = \t {A Unified Weight Initialization Paradigm for Tensorial Convolutional Neural Networks},\n author = {Pan, Yu and Su, Zeyong and Liu, Ao and Jingquan, Wang and Li, Nannan and Xu, Zenglin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17238--17257},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pan22b/pan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/pan22b.html},\n abstract = \t {Tensorial Convolutional Neural Networks (TCNNs) have attracted much research attention for their power in reducing model parameters or enhancing the generalization ability. However, exploration of TCNNs is hindered even from weight initialization methods. To be specific, general initialization methods, such as Xavier or Kaiming initialization, usually fail to generate appropriate weights for TCNNs. Meanwhile, although there are ad-hoc approaches for specific architectures (e.g., Tensor Ring Nets), they are not applicable to TCNNs with other tensor decomposition methods (e.g., CP or Tucker decomposition). To address this problem, we propose a universal weight initialization paradigm, which generalizes Xavier and Kaiming methods and can be widely applicable to arbitrary TCNNs. Specifically, we first present the Reproducing Transformation to convert the backward process in TCNNs to an equivalent convolution process. Then, based on the convolution operators in the forward and backward processes, we build a unified paradigm to control the variance of features and gradients in TCNNs. Thus, we can derive fan-in and fan-out initialization for various TCNNs. We demonstrate that our paradigm can stabilize the training of TCNNs, leading to faster convergence and better results.}\n}", "pdf": "https://proceedings.mlr.press/v162/pan22b/pan22b.pdf", "supp": "", "pdf_size": 2369152, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2601266852558996821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/pan22b.html" }, { "title": "A data-driven approach for learning to control computers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18071", "id": "18071", "proceeding": "https://proceedings.mlr.press/v162/humphreys22a.html", "poster": "", "slides": "", "author_site": "Peter Humphreys, David Raposo, Tobias Pohlen, Gregory Thornton, Rachita Chhaparia, Alistair Muldal, Josh Abramson, Petko Georgiev, Adam Santoro, Timothy Lillicrap", "author": "Peter C Humphreys; David Raposo; Tobias Pohlen; Gregory Thornton; Rachita Chhaparia; Alistair Muldal; Josh Abramson; Petko Georgiev; Adam Santoro; Timothy Lillicrap", "abstract": "It would be useful for machines to use computers as humans do so that they can aid us in everyday tasks. 
This is a setting in which there is also the potential to leverage large-scale expert demonstrations and human judgements of interactive behaviour, which are two ingredients that have driven much recent success in AI. Here we investigate the setting of computer control using keyboard and mouse, with goals specified via natural language. Instead of focusing on hand-designed curricula and specialized action spaces, we focus on developing a scalable method centered on reinforcement learning combined with behavioural priors informed by actual human-computer interactions. We achieve state-of-the-art and human-level mean performance across all tasks within the MiniWob++ benchmark, a challenging suite of computer control problems, and find strong evidence of cross-task transfer. These results demonstrate the usefulness of a unified human-agent interface when training machines to use computers. Altogether our results suggest a formula for achieving competency beyond MiniWob++ and towards controlling computers, in general, as a human would.", "bibtex": "@InProceedings{pmlr-v162-humphreys22a,\n title = \t {A data-driven approach for learning to control computers},\n author = {Humphreys, Peter C and Raposo, David and Pohlen, Tobias and Thornton, Gregory and Chhaparia, Rachita and Muldal, Alistair and Abramson, Josh and Georgiev, Petko and Santoro, Adam and Lillicrap, Timothy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9466--9482},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/humphreys22a/humphreys22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/humphreys22a.html},\n abstract = \t {It would be useful for machines to use computers as humans do so that they can aid us in everyday tasks. This is a setting in which there is also the potential to leverage large-scale expert demonstrations and human judgements of interactive behaviour, which are two ingredients that have driven much recent success in AI. Here we investigate the setting of computer control using keyboard and mouse, with goals specified via natural language. Instead of focusing on hand-designed curricula and specialized action spaces, we focus on developing a scalable method centered on reinforcement learning combined with behavioural priors informed by actual human-computer interactions. We achieve state-of-the-art and human-level mean performance across all tasks within the MiniWob++ benchmark, a challenging suite of computer control problems, and find strong evidence of cross-task transfer. These results demonstrate the usefulness of a unified human-agent interface when training machines to use computers. 
Altogether our results suggest a formula for achieving competency beyond MiniWob++ and towards controlling computers, in general, as a human would.}\n}", "pdf": "https://proceedings.mlr.press/v162/humphreys22a/humphreys22a.pdf", "supp": "", "pdf_size": 4910631, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17602375057296918760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom; DeepMind, London, United Kingdom", "aff_domain": "deepmind.com; ; ; ; ; ; ; ; ;deepmind.com", "email": "deepmind.com; ; ; ; ; ; ; ; ;deepmind.com", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/humphreys22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "A deep convolutional neural network that is invariant to time rescaling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16767", "id": "16767", "proceeding": "https://proceedings.mlr.press/v162/jacques22a.html", "poster": "/media/PosterPDFs/ICML%202022/782086acbe9f48126642e093bf6ba151.png?t=1658069407.1571589", "slides": "", "author_site": "Brandon G Jacques, Zoran Tiganj, Aakash Sarkar, Marc Howard, Per Sederberg", "author": "Brandon G Jacques; Zoran Tiganj; Aakash Sarkar; Marc Howard; Per Sederberg", "abstract": "Human learners can readily understand speech, or a melody, when it is presented slower or faster than usual. This paper presents a deep CNN (SITHCon) that uses a logarithmically compressed temporal representation at each level. Because rescaling the time of the input results in a translation of $\\log$ time, and because the output of the convolution is invariant to translations, this network can generalize to out-of-sample data that are temporal rescalings of a learned pattern. We compare the performance of SITHCon to a Temporal Convolution Network (TCN) on classification and regression problems with both univariate and multivariate time series. We find that SITHCon, unlike TCN, generalizes robustly over rescalings of about an order of magnitude. 
Moreover, we show that the network can generalize over exponentially large scales without retraining the weights simply by extending the range of the logarithmically-compressed temporal memory.", "bibtex": "@InProceedings{pmlr-v162-jacques22a,\n title = \t {A deep convolutional neural network that is invariant to time rescaling},\n author = {Jacques, Brandon G and Tiganj, Zoran and Sarkar, Aakash and Howard, Marc and Sederberg, Per},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9729--9738},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jacques22a/jacques22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jacques22a.html},\n abstract = \t {Human learners can readily understand speech, or a melody, when it is presented slower or faster than usual. This paper presents a deep CNN (SITHCon) that uses a logarithmically compressed temporal representation at each level. Because rescaling the time of the input results in a translation of $\\log$ time, and because the output of the convolution is invariant to translations, this network can generalize to out-of-sample data that are temporal rescalings of a learned pattern. We compare the performance of SITHCon to a Temporal Convolution Network (TCN) on classification and regression problems with both univariate and multivariate time series. We find that SITHCon, unlike TCN, generalizes robustly over rescalings of about an order of magnitude. Moreover, we show that the network can generalize over exponentially large scales without retraining the weights simply by extending the range of the logarithmically-compressed temporal memory.}\n}", "pdf": "https://proceedings.mlr.press/v162/jacques22a/jacques22a.pdf", "supp": "", "pdf_size": 2637066, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=731774651536846779&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Psychology, University of Virginia, Charlottesville, VA, United States; Department of Computer Science, Indiana University, Bloomington, IN, United States; Department of Psychological and Brain Sciences, Boston University, Boston, MA, United States; Department of Psychological and Brain Sciences, Boston University, Boston, MA, United States; Department of Psychology, University of Virginia, Charlottesville, VA, United States", "aff_domain": "virginia.edu; ; ; ;virginia.edu", "email": "virginia.edu; ; ; ;virginia.edu", "github": "", "project": "https://rc.virginia.edu", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/jacques22a.html", "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Virginia;Indiana University;Boston University", "aff_unique_dep": "Department of Psychology;Department of Computer Science;Department of Psychological and Brain Sciences", "aff_unique_url": "https://www.virginia.edu;https://www.indiana.edu;https://www.bu.edu", "aff_unique_abbr": "UVA;IU;BU", "aff_campus_unique_index": "0;1;2;2;0", "aff_campus_unique": "Charlottesville;Bloomington;Boston", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A new similarity measure for covariate shift with applications to nonparametric regression", "status": "Oral", "track": 
"main", "site": "https://icml.cc/virtual/2022/poster/17403", "id": "17403", "proceeding": "https://proceedings.mlr.press/v162/pathak22a.html", "poster": "/media/PosterPDFs/ICML%202022/a424ded436368e3f9f10da14c23acc85.png?t=1657554472.9572663", "slides": "", "author_site": "Reese Pathak, Cong Ma, Martin Wainwright", "author": "Reese Pathak; Cong Ma; Martin Wainwright", "abstract": "We study covariate shift in the context of nonparametric regression. We introduce a new measure of distribution mismatch between the source and target distributions using the integrated ratio of probabilities of balls at a given radius. We use the scaling of this measure with respect to the radius to characterize the minimax rate of estimation over a family of H{\u00f6}lder continuous functions under covariate shift. In comparison to the recently proposed notion of transfer exponent, this measure leads to a sharper rate of convergence and is more fine-grained. We accompany our theory with concrete instances of covariate shift that illustrate this sharp difference.", "bibtex": "@InProceedings{pmlr-v162-pathak22a,\n title = \t {A new similarity measure for covariate shift with applications to nonparametric regression},\n author = {Pathak, Reese and Ma, Cong and Wainwright, Martin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17517--17530},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pathak22a/pathak22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pathak22a.html},\n abstract = \t {We study covariate shift in the context of nonparametric regression. We introduce a new measure of distribution mismatch between the source and target distributions using the integrated ratio of probabilities of balls at a given radius. We use the scaling of this measure with respect to the radius to characterize the minimax rate of estimation over a family of H{\u00f6}lder continuous functions under covariate shift. In comparison to the recently proposed notion of transfer exponent, this measure leads to a sharper rate of convergence and is more fine-grained. 
We accompany our theory with concrete instances of covariate shift that illustrate this sharp difference.}\n}", "pdf": "https://proceedings.mlr.press/v162/pathak22a/pathak22a.pdf", "supp": "", "pdf_size": 1862992, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18375030170443407678&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering and Computer Sciences, University of California, Berkeley+Department of Statistics, University of California, Berkeley; Department of Statistics, University of Chicago; Department of Statistics, University of California, Berkeley", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/pathak22a.html", "aff_unique_index": "0+0;1;0", "aff_unique_norm": "University of California, Berkeley;University of Chicago", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences;Department of Statistics", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu", "aff_unique_abbr": "UC Berkeley;UChicago", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "A query-optimal algorithm for finding counterfactuals", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16451", "id": "16451", "proceeding": "https://proceedings.mlr.press/v162/blanc22a.html", "poster": "/media/PosterPDFs/ICML%202022/acf06cdd9c744f969958e1f085554c8b.png?t=1658092315.512576", "slides": "", "author_site": "Guy Blanc, Caleb Koch, Jane Lange, Li-Yang Tan", "author": "Guy Blanc; Caleb Koch; Jane Lange; Li-Yang Tan", "abstract": "We design an algorithm for finding counterfactuals with strong theoretical guarantees on its performance. For any monotone model $f : X^d \\to \\{0,1\\}$ and instance $x^\\star$, our algorithm makes \\[{S}(f)^{O(\\Delta_f(x^\\star))}\\cdot \\log d\\]{queries} to $f$ and returns an {\\sl optimal} counterfactual for $x^\\star$: a nearest instance $x\u2019$ to $x^\\star$ for which $f(x\u2019)\\ne f(x^\\star)$. Here $S(f)$ is the sensitivity of $f$, a discrete analogue of the Lipschitz constant, and $\\Delta_f(x^\\star)$ is the distance from $x^\\star$ to its nearest counterfactuals. The previous best known query complexity was $d^{\\,O(\\Delta_f(x^\\star))}$, achievable by brute-force local search. We further prove a lower bound of $S(f)^{\\Omega(\\Delta_f(x^\\star))} + \\Omega(\\log d)$ on the query complexity of any algorithm, thereby showing that the guarantees of our algorithm are essentially optimal.", "bibtex": "@InProceedings{pmlr-v162-blanc22a,\n title = \t {A query-optimal algorithm for finding counterfactuals},\n author = {Blanc, Guy and Koch, Caleb and Lange, Jane and Tan, Li-Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2075--2090},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/blanc22a/blanc22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/blanc22a.html},\n abstract = \t {We design an algorithm for finding counterfactuals with strong theoretical guarantees on its performance. 
For any monotone model $f : X^d \\to \\{0,1\\}$ and instance $x^\\star$, our algorithm makes \\[{S}(f)^{O(\\Delta_f(x^\\star))}\\cdot \\log d\\]{queries} to $f$ and returns an {\\sl optimal} counterfactual for $x^\\star$: a nearest instance $x\u2019$ to $x^\\star$ for which $f(x\u2019)\\ne f(x^\\star)$. Here $S(f)$ is the sensitivity of $f$, a discrete analogue of the Lipschitz constant, and $\\Delta_f(x^\\star)$ is the distance from $x^\\star$ to its nearest counterfactuals. The previous best known query complexity was $d^{\\,O(\\Delta_f(x^\\star))}$, achievable by brute-force local search. We further prove a lower bound of $S(f)^{\\Omega(\\Delta_f(x^\\star))} + \\Omega(\\log d)$ on the query complexity of any algorithm, thereby showing that the guarantees of our algorithm are essentially optimal.}\n}", "pdf": "https://proceedings.mlr.press/v162/blanc22a/blanc22a.pdf", "supp": "", "pdf_size": 358669, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13003004863217350403&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Massachusetts Institute of Technology; Department of Computer Science, Stanford University", "aff_domain": "stanford.edu;stanford.edu;mit.edu;cs.stanford.edu", "email": "stanford.edu;stanford.edu;mit.edu;cs.stanford.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/blanc22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu", "aff_unique_abbr": "Stanford;MIT", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Stanford;Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "A$^3$T: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17597", "id": "17597", "proceeding": "https://proceedings.mlr.press/v162/bai22d.html", "poster": "/media/PosterPDFs/ICML%202022/2794f6a20ee0685f4006210f40799acd_qmu2ivW.png?t=1657418962.8758461", "slides": "", "author_site": "He Bai, Renjie Zheng, Junkun Chen, Mingbo Ma, Xintong Li, Liang Huang", "author": "He Bai; Renjie Zheng; Junkun Chen; Mingbo Ma; Xintong Li; Liang Huang", "abstract": "Recently, speech representation learning has improved many speech-related tasks such as speech recognition, speech classification, and speech-to-text translation. However, all the above tasks are in the direction of speech understanding, but for the inverse direction, speech synthesis, the potential of representation learning is yet to be realized, due to the challenging nature of generating high-quality speech. To address this problem, we propose our framework, Alignment-Aware Acoustic-Text Pretraining (A$^3$T), which reconstructs masked acoustic signals with text input and acoustic-text alignment during training. In this way, the pretrained model can generate high quality reconstructed spectrogram, which can be applied to the speech editing and unseen speaker TTS directly. 
Experiments show A$^3$T outperforms SOTA models on speech editing, and improves multi-speaker speech synthesis without the external speaker verification model.", "bibtex": "@InProceedings{pmlr-v162-bai22d,\n title = \t {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing},\n author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1399--1411},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bai22d/bai22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/bai22d.html},\n abstract = \t {Recently, speech representation learning has improved many speech-related tasks such as speech recognition, speech classification, and speech-to-text translation. However, all the above tasks are in the direction of speech understanding, but for the inverse direction, speech synthesis, the potential of representation learning is yet to be realized, due to the challenging nature of generating high-quality speech. To address this problem, we propose our framework, Alignment-Aware Acoustic-Text Pretraining (A$^3$T), which reconstructs masked acoustic signals with text input and acoustic-text alignment during training. In this way, the pretrained model can generate high quality reconstructed spectrogram, which can be applied to the speech editing and unseen speaker TTS directly. Experiments show A$^3$T outperforms SOTA models on speech editing, and improves multi-speaker speech synthesis without the external speaker verification model.}\n}", "pdf": "https://proceedings.mlr.press/v162/bai22d/bai22d.pdf", "supp": "", "pdf_size": 5110300, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17582413071474452331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Waterloo, Waterloo, ON, Canada (work done at Baidu Research USA); Baidu Research, Sunnyvale, CA, USA; Oregon State University, Corvallis, OR, USA; Baidu Research, Sunnyvale, CA, USA; Baidu Research, Sunnyvale, CA, USA; Oregon State University, Corvallis, OR, USA", "aff_domain": "gmail.com;gmail.com; ; ; ;", "email": "gmail.com;gmail.com; ; ; ;", "github": "https://github.com/richardbaihe/a3t", "project": "https://educated-toothpaste-462.notion.site/Demo-b0edd300e6004c508744c6259369a468", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/bai22d.html", "aff_unique_index": "0;1;2;1;1;2", "aff_unique_norm": "University of Waterloo;Baidu;Oregon State University", "aff_unique_dep": ";Research;", "aff_unique_url": "https://uwaterloo.ca;https://research.baidu.com;https://oregonstate.edu", "aff_unique_abbr": "UW;Baidu Res.;OSU", "aff_campus_unique_index": "0;1;2;1;1;2", "aff_campus_unique": "Waterloo;Sunnyvale;Corvallis", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Canada;United States" }, { "title": "AGNAS: Attention-Guided Micro and Macro-Architecture Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18313", "id": "18313", "proceeding": "https://proceedings.mlr.press/v162/sun22a.html", "poster": "/media/PosterPDFs/ICML%202022/ed57844fa5e051809ead5aa7e3e1d555.png?t=1658231948.5338814", 
"slides": "", "author_site": "Zihao Sun, Yu Hu, Shun Lu, Longxing Yang, Jilin Mei, Yinhe Han, Xiaowei Li", "author": "Zihao Sun; Yu Hu; Shun Lu; Longxing Yang; Jilin Mei; Yinhe Han; Xiaowei Li", "abstract": "Micro- and macro-architecture search have emerged as two popular NAS paradigms recently. Existing methods leverage different search strategies for searching micro- and macro- architectures. When using architecture parameters to search for micro-structure such as normal cell and reduction cell, the architecture parameters can not fully reflect the corresponding operation importance. When searching for the macro-structure chained by pre-defined blocks, many sub-networks need to be sampled for evaluation, which is very time-consuming. To address the two issues, we propose a new search paradigm, that is, leverage the attention mechanism to guide the micro- and macro-architecture search, namely AGNAS. Specifically, we introduce an attention module and plug it behind each candidate operation or each candidate block. We utilize the attention weights to represent the importance of the relevant operations for the micro search or the importance of the relevant blocks for the macro search. Experimental results show that AGNAS can achieve 2.46% test error on CIFAR-10 in the DARTS search space, and 23.4% test error when directly searching on ImageNet in the ProxylessNAS search space. AGNAS also achieves optimal performance on NAS-Bench-201, outperforming state-of-the-art approaches. The source code can be available at https://github.com/Sunzh1996/AGNAS.", "bibtex": "@InProceedings{pmlr-v162-sun22a,\n title = \t {{AGNAS}: Attention-Guided Micro and Macro-Architecture Search},\n author = {Sun, Zihao and Hu, Yu and Lu, Shun and Yang, Longxing and Mei, Jilin and Han, Yinhe and Li, Xiaowei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20777--20789},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sun22a/sun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sun22a.html},\n abstract = \t {Micro- and macro-architecture search have emerged as two popular NAS paradigms recently. Existing methods leverage different search strategies for searching micro- and macro- architectures. When using architecture parameters to search for micro-structure such as normal cell and reduction cell, the architecture parameters can not fully reflect the corresponding operation importance. When searching for the macro-structure chained by pre-defined blocks, many sub-networks need to be sampled for evaluation, which is very time-consuming. To address the two issues, we propose a new search paradigm, that is, leverage the attention mechanism to guide the micro- and macro-architecture search, namely AGNAS. Specifically, we introduce an attention module and plug it behind each candidate operation or each candidate block. We utilize the attention weights to represent the importance of the relevant operations for the micro search or the importance of the relevant blocks for the macro search. Experimental results show that AGNAS can achieve 2.46% test error on CIFAR-10 in the DARTS search space, and 23.4% test error when directly searching on ImageNet in the ProxylessNAS search space. 
AGNAS also achieves optimal performance on NAS-Bench-201, outperforming state-of-the-art approaches. The source code can be available at https://github.com/Sunzh1996/AGNAS.}\n}", "pdf": "https://proceedings.mlr.press/v162/sun22a/sun22a.pdf", "supp": "", "pdf_size": 1055823, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17131789715664544080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "https://github.com/Sunzh1996/AGNAS", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/sun22a.html" }, { "title": "ASAP.SGD: Instance-based Adaptiveness to Staleness in Asynchronous SGD", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17481", "id": "17481", "proceeding": "https://proceedings.mlr.press/v162/backstrom22a.html", "poster": "/media/PosterPDFs/ICML%202022/69c7e73fea7ad35e9000ce41e1622036.png?t=1656760178.8179922", "slides": "/media/icml-2022/Slides/17481.pdf", "author_site": "Karl B\u00e4ckstr\u00f6m, Marina Papatriantafilou, Philippas Tsigas", "author": "Karl B\u00e4ckstr\u00f6m; Marina Papatriantafilou; Philippas Tsigas", "abstract": "Concurrent algorithmic implementations of Stochastic Gradient Descent (SGD) give rise to critical questions for compute-intensive Machine Learning (ML). Asynchrony implies speedup in some contexts, and challenges in others, as stale updates may lead to slower, or non-converging executions. While previous works showed asynchrony-adaptiveness can improve stability and speedup by reducing the step size for stale updates according to static rules, there is no one-size-fits-all adaptation rule, since the optimal strategy depends on several factors. We introduce (i)\u00a0$\\mathtt{ASAP.SGD}$, an analytical framework capturing necessary and desired properties of staleness-adaptive step size functions and (ii)\u00a0\\textsc{tail}-$\\tau$, a method for utilizing key properties of the", "bibtex": "@InProceedings{pmlr-v162-backstrom22a,\n title = \t {{ASAP}.{SGD}: Instance-based Adaptiveness to Staleness in Asynchronous {SGD}},\n author = {B{\\\"a}ckstr{\\\"o}m, Karl and Papatriantafilou, Marina and Tsigas, Philippas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1261--1276},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/backstrom22a/backstrom22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/backstrom22a.html},\n abstract = \t {Concurrent algorithmic implementations of Stochastic Gradient Descent (SGD) give rise to critical questions for compute-intensive Machine Learning (ML). Asynchrony implies speedup in some contexts, and challenges in others, as stale updates may lead to slower, or non-converging executions. While previous works showed asynchrony-adaptiveness can improve stability and speedup by reducing the step size for stale updates according to static rules, there is no one-size-fits-all adaptation rule, since the optimal strategy depends on several factors. 
We introduce (i)\u00a0$\\mathtt{ASAP.SGD}$, an analytical framework capturing necessary and desired properties of staleness-adaptive step size functions and (ii)\u00a0\\textsc{tail}-$\\tau$, a method for utilizing key properties of the", "pdf": "https://proceedings.mlr.press/v162/backstrom22a/backstrom22a.pdf", "supp": "", "pdf_size": 1206646, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7742475634801252786&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden; Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden; Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden", "aff_domain": "chalmers.se; ; ", "email": "chalmers.se; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/backstrom22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Chalmers University of Technology", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.chalmers.se", "aff_unique_abbr": "Chalmers", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Gothenburg", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "title": "Accelerated Federated Learning with Decoupled Adaptive Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17539", "id": "17539", "proceeding": "https://proceedings.mlr.press/v162/jin22e.html", "poster": "/media/PosterPDFs/ICML%202022/31a6b5568bbb9c350c1b296d9086acf0.png?t=1657307760.4487631", "slides": "", "author_site": "Jiayin Jin, Jiaxiang Ren, Yang Zhou, Lingjuan Lyu, Ji Liu, Dejing Dou", "author": "Jiayin Jin; Jiaxiang Ren; Yang Zhou; Lingjuan Lyu; Ji Liu; Dejing Dou", "abstract": "The federated learning (FL) framework enables edge clients to collaboratively learn a shared inference model while keeping privacy of training data on clients. Recently, many heuristics efforts have been made to generalize centralized adaptive optimization methods, such as SGDM, Adam, AdaGrad, etc., to federated settings for improving convergence and accuracy. However, there is still a paucity of theoretical principles on where to and how to design and utilize adaptive optimization methods in federated settings. This work aims to develop novel adaptive optimization methods for FL from the perspective of dynamics of ordinary differential equations (ODEs). First, an analytic framework is established to build a connection between federated optimization methods and decompositions of ODEs of corresponding centralized optimizers. Second, based on this analytic framework, a momentum decoupling adaptive optimization method, FedDA, is developed to fully utilize the global momentum on each local iteration and accelerate the training convergence. 
Last but not least, full batch gradients are utilized to mimic centralized optimization in the end of the training process to ensure the convergence and overcome the possible inconsistency caused by adaptive optimization methods.", "bibtex": "@InProceedings{pmlr-v162-jin22e,\n title = \t {Accelerated Federated Learning with Decoupled Adaptive Optimization},\n author = {Jin, Jiayin and Ren, Jiaxiang and Zhou, Yang and Lyu, Lingjuan and Liu, Ji and Dou, Dejing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10298--10322},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22e/jin22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22e.html},\n abstract = \t {The federated learning (FL) framework enables edge clients to collaboratively learn a shared inference model while keeping privacy of training data on clients. Recently, many heuristics efforts have been made to generalize centralized adaptive optimization methods, such as SGDM, Adam, AdaGrad, etc., to federated settings for improving convergence and accuracy. However, there is still a paucity of theoretical principles on where to and how to design and utilize adaptive optimization methods in federated settings. This work aims to develop novel adaptive optimization methods for FL from the perspective of dynamics of ordinary differential equations (ODEs). First, an analytic framework is established to build a connection between federated optimization methods and decompositions of ODEs of corresponding centralized optimizers. Second, based on this analytic framework, a momentum decoupling adaptive optimization method, FedDA, is developed to fully utilize the global momentum on each local iteration and accelerate the training convergence. 
Last but not least, full batch gradients are utilized to mimic centralized optimization in the end of the training process to ensure the convergence and overcome the possible inconsistency caused by adaptive optimization methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22e/jin22e.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jin22e-supp.zip", "pdf_size": 776468, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6719255961638637066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Auburn University, USA; Auburn University, USA; Auburn University, USA; Sony AI, Japan; Baidu Research, China; Baidu Research, China + University of Oregon, USA", "aff_domain": "auburn.edu;auburn.edu;auburn.edu;sony.com;baidu.com;baidu.com", "email": "auburn.edu;auburn.edu;auburn.edu;sony.com;baidu.com;baidu.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/jin22e.html", "aff_unique_index": "0;0;0;1;2;2+3", "aff_unique_norm": "Auburn University;Sony AI;Baidu;University of Oregon", "aff_unique_dep": ";;Baidu Research;", "aff_unique_url": "https://www.auburn.edu;https://ai.sony.com;https://research.baidu.com;https://www.uoregon.edu", "aff_unique_abbr": "Auburn;Sony AI;Baidu;UO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;2;2+0", "aff_country_unique": "United States;Japan;China" }, { "title": "Accelerated Gradient Methods for Geodesically Convex Optimization: Tractable Algorithms and Convergence Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17547", "id": "17547", "proceeding": "https://proceedings.mlr.press/v162/kim22k.html", "poster": "/media/PosterPDFs/ICML%202022/d5da28d4865fb92720359db84e0dd0dd.png?t=1657827049.003283", "slides": "", "author_site": "Jungbin Kim, Insoon Yang", "author": "Jungbin Kim; Insoon Yang", "abstract": "We propose computationally tractable accelerated first-order methods for Riemannian optimization, extending the Nesterov accelerated gradient (NAG) method. For both geodesically convex and geodesically strongly convex objective functions, our algorithms are shown to have the same iteration complexities as those for the NAG method on Euclidean spaces, under only standard assumptions. To the best of our knowledge, the proposed scheme is the first fully accelerated method for geodesically convex optimization problems. Our convergence analysis makes use of novel metric distortion lemmas as well as carefully designed potential functions. A connection with the continuous-time dynamics for modeling Riemannian acceleration in (Alimisis et al., 2020) is also identified by letting the stepsize tend to zero. 
We validate our theoretical results through numerical experiments.", "bibtex": "@InProceedings{pmlr-v162-kim22k,\n title = \t {Accelerated Gradient Methods for Geodesically Convex Optimization: Tractable Algorithms and Convergence Analysis},\n author = {Kim, Jungbin and Yang, Insoon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11255--11282},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22k/kim22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22k.html},\n abstract = \t {We propose computationally tractable accelerated first-order methods for Riemannian optimization, extending the Nesterov accelerated gradient (NAG) method. For both geodesically convex and geodesically strongly convex objective functions, our algorithms are shown to have the same iteration complexities as those for the NAG method on Euclidean spaces, under only standard assumptions. To the best of our knowledge, the proposed scheme is the first fully accelerated method for geodesically convex optimization problems. Our convergence analysis makes use of novel metric distortion lemmas as well as carefully designed potential functions. A connection with the continuous-time dynamics for modeling Riemannian acceleration in (Alimisis et al., 2020) is also identified by letting the stepsize tend to zero. We validate our theoretical results through numerical experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22k/kim22k.pdf", "supp": "", "pdf_size": 1106929, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18354530104939877032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea; Department of Electrical and Computer Engineering, Seoul National University, Seoul, South Korea", "aff_domain": "snu.ac.kr;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kim22k.html", "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Accelerated, Optimal and Parallel: Some results on model-based stochastic optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16341", "id": "16341", "proceeding": "https://proceedings.mlr.press/v162/chadha22a.html", "poster": "", "slides": "", "author_site": "Karan Chadha, Gary Cheng, John Duchi", "author": "Karan Chadha; Gary Cheng; John Duchi", "abstract": "The Approximate-Proximal Point (APROX) family of model-based stochastic optimization algorithms improve over standard stochastic gradient methods, as they are robust to step size choices, adaptive to problem difficulty, converge on a broader range of problems than stochastic gradient methods, and converge very fast on interpolation problems, all while retaining nice minibatching properties\u00a0\\cite{AsiDu19siopt,AsiChChDu20}. 
In this paper, we propose an acceleration scheme for the APROX family and provide non-asymptotic convergence guarantees, which are order-optimal in all problem-dependent constants and provide even larger minibatching speedups. For interpolation problems where the objective satisfies additional growth conditions, we show that our algorithm achieves linear convergence rates for a wide range of stepsizes. In this setting, we also prove matching lower bounds, identifying new fundamental constants and showing the optimality of the APROX family. We corroborate our theoretical results with empirical testing to demonstrate the gains accurate modeling, acceleration, and minibatching provide.", "bibtex": "@InProceedings{pmlr-v162-chadha22a,\n title = \t {Accelerated, Optimal and Parallel: Some results on model-based stochastic optimization},\n author = {Chadha, Karan and Cheng, Gary and Duchi, John},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2811--2827},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chadha22a/chadha22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chadha22a.html},\n abstract = \t {The Approximate-Proximal Point (APROX) family of model-based stochastic optimization algorithms improve over standard stochastic gradient methods, as they are robust to step size choices, adaptive to problem difficulty, converge on a broader range of problems than stochastic gradient methods, and converge very fast on interpolation problems, all while retaining nice minibatching properties\u00a0\\cite{AsiDu19siopt,AsiChChDu20}. In this paper, we propose an acceleration scheme for the APROX family and provide non-asymptotic convergence guarantees, which are order-optimal in all problem-dependent constants and provide even larger minibatching speedups. For interpolation problems where the objective satisfies additional growth conditions, we show that our algorithm achieves linear convergence rates for a wide range of stepsizes. In this setting, we also prove matching lower bounds, identifying new fundamental constants and showing the optimality of the APROX family. 
We corroborate our theoretical results with empirical testing to demonstrate the gains accurate modeling, acceleration, and minibatching provide.}\n}", "pdf": "https://proceedings.mlr.press/v162/chadha22a/chadha22a.pdf", "supp": "", "pdf_size": 530822, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16817342190373101252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Electrical Engineering Department, Stanford University; Electrical Engineering Department, Stanford University; Electrical Engineering Department, Stanford University + Statistics Department, Stanford University", "aff_domain": "stanford.edu;stanford.edu; ", "email": "stanford.edu;stanford.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chadha22a.html", "aff_unique_index": "0;0;0+0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Electrical Engineering Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "United States" }, { "title": "Accelerating Bayesian Optimization for Biological Sequence Design with Denoising Autoencoders", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18027", "id": "18027", "proceeding": "https://proceedings.mlr.press/v162/stanton22a.html", "poster": "/media/PosterPDFs/ICML%202022/34ad9bc83e3c72c62281cb2c744ac966_McpNQsy.png?t=1657907082.090144", "slides": "", "author_site": "Samuel Stanton, Wesley Maddox, Nate Gruver, Phillip Maffettone, Emily Delaney, Peyton Greenside, Andrew Wilson", "author": "Samuel Stanton; Wesley Maddox; Nate Gruver; Phillip Maffettone; Emily Delaney; Peyton Greenside; Andrew Gordon Wilson", "abstract": "Bayesian optimization (BayesOpt) is a gold standard for query-efficient continuous optimization. However, its adoption for drug design has been hindered by the discrete, high-dimensional nature of the decision variables. We develop a new approach (LaMBO) which jointly trains a denoising autoencoder with a discriminative multi-task Gaussian process head, allowing gradient-based optimization of multi-objective acquisition functions in the latent space of the autoencoder. These acquisition functions allow LaMBO to balance the explore-exploit tradeoff over multiple design rounds, and to balance objective tradeoffs by optimizing sequences at many different points on the Pareto frontier. We evaluate LaMBO on two small-molecule design tasks, and introduce new tasks optimizing in silico and in vitro properties of large-molecule fluorescent proteins. 
In our experiments LaMBO outperforms genetic optimizers and does not require a large pretraining corpus, demonstrating that BayesOpt is practical and effective for biological sequence design.", "bibtex": "@InProceedings{pmlr-v162-stanton22a,\n title = \t {Accelerating {B}ayesian Optimization for Biological Sequence Design with Denoising Autoencoders},\n author = {Stanton, Samuel and Maddox, Wesley and Gruver, Nate and Maffettone, Phillip and Delaney, Emily and Greenside, Peyton and Wilson, Andrew Gordon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20459--20478},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/stanton22a/stanton22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/stanton22a.html},\n abstract = \t {Bayesian optimization (BayesOpt) is a gold standard for query-efficient continuous optimization. However, its adoption for drug design has been hindered by the discrete, high-dimensional nature of the decision variables. We develop a new approach (LaMBO) which jointly trains a denoising autoencoder with a discriminative multi-task Gaussian process head, allowing gradient-based optimization of multi-objective acquisition functions in the latent space of the autoencoder. These acquisition functions allow LaMBO to balance the explore-exploit tradeoff over multiple design rounds, and to balance objective tradeoffs by optimizing sequences at many different points on the Pareto frontier. We evaluate LaMBO on two small-molecule design tasks, and introduce new tasks optimizing in silico and in vitro properties of large-molecule fluorescent proteins. 
In our experiments LaMBO outperforms genetic optimizers and does not require a large pretraining corpus, demonstrating that BayesOpt is practical and effective for biological sequence design.}\n}", "pdf": "https://proceedings.mlr.press/v162/stanton22a/stanton22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/stanton22a-supp.zip", "pdf_size": 1072072, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2506639909996415595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; BigHat Biosciences, San Mateo, CA, USA; BigHat Biosciences, San Mateo, CA, USA; BigHat Biosciences, San Mateo, CA, USA; BigHat Biosciences, San Mateo, CA, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA+BigHat Biosciences, San Mateo, CA, USA", "aff_domain": "nyu.edu; ; ; ; ; ; ", "email": "nyu.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/stanton22a.html", "aff_unique_index": "0+0;0+0;1;1;1;1;0+0+1", "aff_unique_norm": "New York University;BigHat Biosciences", "aff_unique_dep": "Center for Data Science;", "aff_unique_url": "https://www.nyu.edu;", "aff_unique_abbr": "NYU;", "aff_campus_unique_index": "0+0;0+0;1;1;1;1;0+0+1", "aff_campus_unique": "New York;San Mateo", "aff_country_unique_index": "0+0;0+0;0;0;0;0;0+0+0", "aff_country_unique": "United States" }, { "title": "Accelerating Shapley Explanation via Contributive Cooperator Selection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16673", "id": "16673", "proceeding": "https://proceedings.mlr.press/v162/wang22b.html", "poster": "", "slides": "", "author_site": "Guanchu Wang, Yu-Neng Chuang, Mengnan Du, Fan Yang, Quan Zhou, Pushkar Tripathi, Xuanting Cai, Xia Hu", "author": "Guanchu Wang; Yu-Neng Chuang; Mengnan Du; Fan Yang; Quan Zhou; Pushkar Tripathi; Xuanting Cai; Xia Hu", "abstract": "Even though Shapley value provides an effective explanation for a DNN model prediction, the computation relies on the enumeration of all possible input feature coalitions, which leads to the exponentially growing complexity. To address this problem, we propose a novel method SHEAR to significantly accelerate the Shapley explanation for DNN models, where only a few coalitions of input features are involved in the computation. The selection of the feature coalitions follows our proposed Shapley chain rule to minimize the absolute error from the ground-truth Shapley values, such that the computation can be both efficient and accurate. To demonstrate the effectiveness, we comprehensively evaluate SHEAR across multiple metrics including the absolute error from the ground-truth Shapley value, the faithfulness of the explanations, and running speed. 
The experimental results indicate SHEAR consistently outperforms state-of-the-art baseline methods across different evaluation metrics, which demonstrates its potential in real-world applications where computational resources are limited.", "bibtex": "@InProceedings{pmlr-v162-wang22b,\n title = \t {Accelerating Shapley Explanation via Contributive Cooperator Selection},\n author = {Wang, Guanchu and Chuang, Yu-Neng and Du, Mengnan and Yang, Fan and Zhou, Quan and Tripathi, Pushkar and Cai, Xuanting and Hu, Xia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22576--22590},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22b/wang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22b.html},\n abstract = \t {Even though Shapley value provides an effective explanation for a DNN model prediction, the computation relies on the enumeration of all possible input feature coalitions, which leads to the exponentially growing complexity. To address this problem, we propose a novel method SHEAR to significantly accelerate the Shapley explanation for DNN models, where only a few coalitions of input features are involved in the computation. The selection of the feature coalitions follows our proposed Shapley chain rule to minimize the absolute error from the ground-truth Shapley values, such that the computation can be both efficient and accurate. To demonstrate the effectiveness, we comprehensively evaluate SHEAR across multiple metrics including the absolute error from the ground-truth Shapley value, the faithfulness of the explanations, and running speed. 
The experimental results indicate SHEAR consistently outperforms state-of-the-art baseline methods across different evaluation metrics, which demonstrates its potential in real-world applications where computational resources are limited.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22b/wang22b.pdf", "supp": "", "pdf_size": 763729, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2493376524235633954&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Rice University; Department of Computer Science, Rice University; Department of Computer Science and Engineering, Texas A&M University; Department of Computer Science, Rice University; Meta Platforms, Inc.; Meta Platforms, Inc.; Meta Platforms, Inc.; Department of Computer Science, Rice University", "aff_domain": "rice.edu;rice.edu;tamu.edu;rice.edu;meta.com;meta.com;meta.com;rice.edu", "email": "rice.edu;rice.edu;tamu.edu;rice.edu;meta.com;meta.com;meta.com;rice.edu", "github": "https://github.com/guanchuwang/SHEAR", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/wang22b.html", "aff_unique_index": "0;0;1;0;2;2;2;0", "aff_unique_norm": "Rice University;Texas A&M University;Meta", "aff_unique_dep": "Department of Computer Science;Department of Computer Science and Engineering;Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://www.tamu.edu;https://www.meta.com", "aff_unique_abbr": "Rice;TAMU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Accurate Quantization of Measures via Interacting Particle-based Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18285", "id": "18285", "proceeding": "https://proceedings.mlr.press/v162/xu22d.html", "poster": "/media/PosterPDFs/ICML%202022/109d2dd3608f669ca17920c511c2a41e_aVjgVh0.png?t=1657580919.9079356", "slides": "/media/icml-2022/Slides/18285.pdf", "author_site": "Lantian Xu, Anna Korba, Dejan Slepcev", "author": "Lantian Xu; Anna Korba; Dejan Slepcev", "abstract": "Approximating a target probability distribution can be cast as an optimization problem where the objective functional measures the dissimilarity to the target. This optimization can be addressed by approximating Wasserstein and related gradient flows. In practice, these are simulated by interacting particle systems, whose stationary states define an empirical measure approximating the target distribution. This approach has been popularized recently to design sampling algorithms, e.g. Stein Variational Gradient Descent, or by minimizing the Maximum Mean or Kernel Stein Discrepancy. However, little is known about quantization properties of these approaches, i.e. how well the target is approximated by a finite number of particles. We investigate this question theoretically and numerically. In particular, we prove general upper bounds on the quantization error of MMD and KSD at rates which significantly outperform quantization by i.i.d. samples. We conduct experiments which show that the particle systems under study achieve fast rates in practice, and notably outperform greedy algorithms, such as kernel herding. We compare different gradient flows and highlight their quantization rates. Furthermore we introduce a Normalized Stein Variational Gradient Descent and argue in favor of adaptive kernels, which exhibit faster convergence. 
Finally we compare the Gaussian and Laplace kernels and argue that the Laplace kernel provides a more robust quantization.", "bibtex": "@InProceedings{pmlr-v162-xu22d,\n title = \t {Accurate Quantization of Measures via Interacting Particle-based Optimization},\n author = {Xu, Lantian and Korba, Anna and Slepcev, Dejan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24576--24595},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22d/xu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22d.html},\n abstract = \t {Approximating a target probability distribution can be cast as an optimization problem where the objective functional measures the dissimilarity to the target. This optimization can be addressed by approximating Wasserstein and related gradient flows. In practice, these are simulated by interacting particle systems, whose stationary states define an empirical measure approximating the target distribution. This approach has been popularized recently to design sampling algorithms, e.g. Stein Variational Gradient Descent, or by minimizing the Maximum Mean or Kernel Stein Discrepancy. However, little is known about quantization properties of these approaches, i.e. how well the target is approximated by a finite number of particles. We investigate this question theoretically and numerically. In particular, we prove general upper bounds on the quantization error of MMD and KSD at rates which significantly outperform quantization by i.i.d. samples. We conduct experiments which show that the particle systems under study achieve fast rates in practice, and notably outperform greedy algorithms, such as kernel herding. We compare different gradient flows and highlight their quantization rates. Furthermore we introduce a Normalized Stein Variational Gradient Descent and argue in favor of adaptive kernels, which exhibit faster convergence. 
Finally we compare the Gaussian and Laplace kernels and argue that the Laplace kernel provides a more robust quantization.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22d/xu22d.pdf", "supp": "", "pdf_size": 1135523, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17003284511305807860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Carnegie Mellon University; CREST, ENSAE, IP Paris; Carnegie Mellon University", "aff_domain": "andrew.cmu.edu; ; ", "email": "andrew.cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/xu22d.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;CREST", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.crest.fr", "aff_unique_abbr": "CMU;CREST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;France" }, { "title": "Achieving Fairness at No Utility Cost via Data Reweighing with Influence", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17309", "id": "17309", "proceeding": "https://proceedings.mlr.press/v162/li22p.html", "poster": "/media/PosterPDFs/ICML%202022/9c838d2e45b2ad1094d42f4ef36764f6.png?t=1657686610.7029245", "slides": "", "author_site": "Peizhao Li, Hongfu Liu", "author": "Peizhao Li; Hongfu Liu", "abstract": "With the fast development of algorithmic governance, fairness has become a compulsory property for machine learning models to suppress unintentional discrimination. In this paper, we focus on the pre-processing aspect for achieving fairness, and propose a data reweighing approach that only adjusts the weight for samples in the training phase. Different from most previous reweighing methods which usually assign a uniform weight for each (sub)group, we granularly model the influence of each training sample with regard to fairness-related quantity and predictive utility, and compute individual weights based on influence under the constraints from both fairness and utility. Experimental results reveal that previous methods achieve fairness at a non-negligible cost of utility, while as a significant advantage, our approach can empirically release the tradeoff and obtain cost-free fairness for equal opportunity. We demonstrate the cost-free fairness through vanilla classifiers and standard training processes, compared to baseline methods on multiple real-world tabular datasets. Code available at https://github.com/brandeis-machine-learning/influence-fairness.", "bibtex": "@InProceedings{pmlr-v162-li22p,\n title = \t {Achieving Fairness at No Utility Cost via Data Reweighing with Influence},\n author = {Li, Peizhao and Liu, Hongfu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12917--12930},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22p/li22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22p.html},\n abstract = \t {With the fast development of algorithmic governance, fairness has become a compulsory property for machine learning models to suppress unintentional discrimination. 
In this paper, we focus on the pre-processing aspect for achieving fairness, and propose a data reweighing approach that only adjusts the weight for samples in the training phase. Different from most previous reweighing methods which usually assign a uniform weight for each (sub)group, we granularly model the influence of each training sample with regard to fairness-related quantity and predictive utility, and compute individual weights based on influence under the constraints from both fairness and utility. Experimental results reveal that previous methods achieve fairness at a non-negligible cost of utility, while as a significant advantage, our approach can empirically release the tradeoff and obtain cost-free fairness for equal opportunity. We demonstrate the cost-free fairness through vanilla classifiers and standard training processes, compared to baseline methods on multiple real-world tabular datasets. Code available at https://github.com/brandeis-machine-learning/influence-fairness.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22p/li22p.pdf", "supp": "", "pdf_size": 1801517, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1481946580804842338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Brandeis University; Brandeis University", "aff_domain": "brandeis.edu; ", "email": "brandeis.edu; ", "github": "https://github.com/brandeis-machine-learning/influence-fairness", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22p.html", "aff_unique_index": "0;0", "aff_unique_norm": "Brandeis University", "aff_unique_dep": "", "aff_unique_url": "https://www.brandeis.edu", "aff_unique_abbr": "Brandeis", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Achieving Minimax Rates in Pool-Based Batch Active Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16505", "id": "16505", "proceeding": "https://proceedings.mlr.press/v162/gentile22a.html", "poster": "/media/PosterPDFs/ICML%202022/e769e03a9d329b2e864b4bf4ff54ff39_u3yk6yq.png?t=1657162854.5518951", "slides": "/media/icml-2022/Slides/16505_DOxSoFf.pdf", "author_site": "Claudio Gentile, Zhilei Wang, Tong Zhang", "author": "Claudio Gentile; Zhilei Wang; Tong Zhang", "abstract": "We consider a batch active learning scenario where the learner adaptively issues batches of points to a labeling oracle. Sampling labels in batches is highly desirable in practice due to the smaller number of interactive rounds with the labeling oracle (often human beings). However, batch active learning typically pays the price of a reduced adaptivity, leading to suboptimal results. In this paper we propose a solution which requires a careful trade off between the informativeness of the queried points and their diversity. 
We theoretically investigate batch active learning in the practically relevant scenario where the unlabeled pool of data is available beforehand (", "bibtex": "@InProceedings{pmlr-v162-gentile22a,\n title = \t {Achieving Minimax Rates in Pool-Based Batch Active Learning},\n author = {Gentile, Claudio and Wang, Zhilei and Zhang, Tong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7339--7367},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gentile22a/gentile22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gentile22a.html},\n abstract = \t {We consider a batch active learning scenario where the learner adaptively issues batches of points to a labeling oracle. Sampling labels in batches is highly desirable in practice due to the smaller number of interactive rounds with the labeling oracle (often human beings). However, batch active learning typically pays the price of a reduced adaptivity, leading to suboptimal results. In this paper we propose a solution which requires a careful trade off between the informativeness of the queried points and their diversity. We theoretically investigate batch active learning in the practically relevant scenario where the unlabeled pool of data is available beforehand (", "pdf": "https://proceedings.mlr.press/v162/gentile22a/gentile22a.pdf", "supp": "", "pdf_size": 447153, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6956157902451559992&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Google Research, New York; Citadel Securities, New York; The Hong Kong University of Science and Technology, Hong Kong", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gentile22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Google;Citadel Securities;Hong Kong University of Science and Technology", "aff_unique_dep": "Google Research;;", "aff_unique_url": "https://research.google;https://www.citadel.com;https://www.ust.hk", "aff_unique_abbr": "Google;Citadel;HKUST", "aff_campus_unique_index": "0;2", "aff_campus_unique": "New York;;Hong Kong SAR", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;China" }, { "title": "Action-Sufficient State Representation Learning for Control with Structural Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17323", "id": "17323", "proceeding": "https://proceedings.mlr.press/v162/huang22f.html", "poster": "/media/PosterPDFs/ICML%202022/2475c20d9e9a1aaee80dcbc4e6316157_O3MmaXs.png?t=1657164170.6018019", "slides": "", "author_site": "Biwei Huang, Chaochao Lu, Liu Leqi, Jose Miguel Hernandez-Lobato, Clark Glymour, Bernhard Sch\u00f6lkopf, Kun Zhang", "author": "Biwei Huang; Chaochao Lu; Liu Leqi; Jose Miguel Hernandez-Lobato; Clark Glymour; Bernhard Sch\u00f6lkopf; Kun Zhang", "abstract": "Perceived signals in real-world scenarios are usually high-dimensional and noisy, and finding and using their representation that contains essential and sufficient information required by downstream decision-making tasks will help improve computational efficiency and generalization ability in the tasks. 
In this paper, we focus on partially observable environments and propose to learn a minimal set of state representations that capture sufficient information for decision-making, termed Action-Sufficient state Representations (ASRs). We build a generative environment model for the structural relationships among variables in the system and present a principled way to characterize ASRs based on structural constraints and the goal of maximizing cumulative reward in policy learning. We then develop a structured sequential Variational Auto-Encoder to estimate the environment model and extract ASRs. Our empirical results on CarRacing and VizDoom demonstrate a clear advantage of learning and using ASRs for policy learning. Moreover, the estimated environment model and ASRs allow learning behaviors from imagined outcomes in the compact latent space to improve sample efficiency.", "bibtex": "@InProceedings{pmlr-v162-huang22f,\n title = \t {Action-Sufficient State Representation Learning for Control with Structural Constraints},\n author = {Huang, Biwei and Lu, Chaochao and Leqi, Liu and Hernandez-Lobato, Jose Miguel and Glymour, Clark and Sch{\\\"o}lkopf, Bernhard and Zhang, Kun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9260--9279},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22f/huang22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22f.html},\n abstract = \t {Perceived signals in real-world scenarios are usually high-dimensional and noisy, and finding and using their representation that contains essential and sufficient information required by downstream decision-making tasks will help improve computational efficiency and generalization ability in the tasks. In this paper, we focus on partially observable environments and propose to learn a minimal set of state representations that capture sufficient information for decision-making, termed Action-Sufficient state Representations (ASRs). We build a generative environment model for the structural relationships among variables in the system and present a principled way to characterize ASRs based on structural constraints and the goal of maximizing cumulative reward in policy learning. We then develop a structured sequential Variational Auto-Encoder to estimate the environment model and extract ASRs. Our empirical results on CarRacing and VizDoom demonstrate a clear advantage of learning and using ASRs for policy learning. 
Moreover, the estimated environment model and ASRs allow learning behaviors from imagined outcomes in the compact latent space to improve sample efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22f/huang22f.pdf", "supp": "", "pdf_size": 1099686, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7828312335122233004&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Carnegie Mellon University; University of Cambridge + Max Planck Institute for Intelligent Systems, T\u00fcbingen; Carnegie Mellon University; University of Cambridge; Carnegie Mellon University; Max Planck Institute for Intelligent Systems, T\u00fcbingen; Mohamed bin Zayed University of Arti\ufb01cial Intelligence", "aff_domain": "cmu.edu; ; ; ; ; ;cmu.edu", "email": "cmu.edu; ; ; ; ; ;cmu.edu", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/huang22f.html", "aff_unique_index": "0;1+2;0;1;0;2;3", "aff_unique_norm": "Carnegie Mellon University;University of Cambridge;Max Planck Institute for Intelligent Systems;Mohamed bin Zayed University of Artificial Intelligence", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cmu.edu;https://www.cam.ac.uk;https://www.mpi-is.mpg.de;https://mbzuai.ac.ae", "aff_unique_abbr": "CMU;Cambridge;MPI-IS;MBZUAI", "aff_campus_unique_index": "1+2;1;2", "aff_campus_unique": ";Cambridge;T\u00fcbingen", "aff_country_unique_index": "0;1+2;0;1;0;2;3", "aff_country_unique": "United States;United Kingdom;Germany;United Arab Emirates" }, { "title": "Active Learning on a Budget: Opposite Strategies Suit High and Low Budgets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17719", "id": "17719", "proceeding": "https://proceedings.mlr.press/v162/hacohen22a.html", "poster": "/media/PosterPDFs/ICML%202022/59bcda7c438bad7d2afffe9e2fed00be.png?t=1657487927.623183", "slides": "", "author_site": "Guy Hacohen, Avihu Dekel, Daphna Weinshall", "author": "Guy Hacohen; Avihu Dekel; Daphna Weinshall", "abstract": "Investigating active learning, we focus on the relation between the number of labeled examples (budget size), and suitable querying strategies. Our theoretical analysis shows a behavior reminiscent of phase transition: typical examples are best queried when the budget is low, while unrepresentative examples are best queried when the budget is large. Combined evidence shows that a similar phenomenon occurs in common classification models. Accordingly, we propose TypiClust \u2013 a deep active learning strategy suited for low budgets. In a comparative empirical investigation of supervised learning, using a variety of architectures and image datasets, TypiClust outperforms all other active learning strategies in the low-budget regime. Using TypiClust in the semi-supervised framework, performance gets an even more significant boost. In particular, state-of-the-art semi-supervised methods trained on CIFAR-10 with 10 labeled examples selected by TypiClust, reach 93.2% accuracy \u2013 an improvement of 39.4% over random selection. 
Code is available at https://github.com/avihu111/TypiClust.", "bibtex": "@InProceedings{pmlr-v162-hacohen22a,\n title = \t {Active Learning on a Budget: Opposite Strategies Suit High and Low Budgets},\n author = {Hacohen, Guy and Dekel, Avihu and Weinshall, Daphna},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8175--8195},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hacohen22a/hacohen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hacohen22a.html},\n abstract = \t {Investigating active learning, we focus on the relation between the number of labeled examples (budget size), and suitable querying strategies. Our theoretical analysis shows a behavior reminiscent of phase transition: typical examples are best queried when the budget is low, while unrepresentative examples are best queried when the budget is large. Combined evidence shows that a similar phenomenon occurs in common classification models. Accordingly, we propose TypiClust \u2013 a deep active learning strategy suited for low budgets. In a comparative empirical investigation of supervised learning, using a variety of architectures and image datasets, TypiClust outperforms all other active learning strategies in the low-budget regime. Using TypiClust in the semi-supervised framework, performance gets an even more significant boost. In particular, state-of-the-art semi-supervised methods trained on CIFAR-10 with 10 labeled examples selected by TypiClust, reach 93.2% accuracy \u2013 an improvement of 39.4% over random selection. 
Code is available at https://github.com/avihu111/TypiClust.}\n}", "pdf": "https://proceedings.mlr.press/v162/hacohen22a/hacohen22a.pdf", "supp": "", "pdf_size": 8919055, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7933856557848734665&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "School of Computer Science and Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel+Edmond & Lily Safra Center for Brain Sciences, The Hebrew University of Jerusalem, Jerusalem, Israel; School of Computer Science and Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel+Edmond & Lily Safra Center for Brain Sciences, The Hebrew University of Jerusalem, Jerusalem, Israel; School of Computer Science and Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel", "aff_domain": "mail.huji.ac.il;mail.huji.ac.il;cs.huji.ac.il", "email": "mail.huji.ac.il;mail.huji.ac.il;cs.huji.ac.il", "github": "https://github.com/avihu111/TypiClust", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hacohen22a.html", "aff_unique_index": "0+0;0+0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "Israel" }, { "title": "Active Multi-Task Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16863", "id": "16863", "proceeding": "https://proceedings.mlr.press/v162/chen22j.html", "poster": "", "slides": "", "author_site": "Yifang Chen, Kevin Jamieson, Simon Du", "author": "Yifang Chen; Kevin Jamieson; Simon Du", "abstract": "To leverage the power of big data from source domains and overcome the scarcity of target domain samples, representation learning based on multi-task pretraining has become a standard approach in many applications. However, large-scale pretraining is often computationally expensive and not affordable for small organizations. When there is only one target task, most source tasks can be irrelevant, and we can actively sample a subset of source data from the most To leverage the power of big data from source tasks and overcome the scarcity of the target task samples, representation learning based on multi-task pretraining has become a standard approach in many applications. However, up until now, choosing which source tasks to include in the multi-task learning has been more art than science. In this paper, we give the first formal study on resource task sampling by leveraging the techniques from active learning. We propose an algorithm that iteratively estimates the relevance of each source task to the target task and samples from each source task based on the estimated relevance. Theoretically, we show that for the linear representation class, to achieve the same error rate, our algorithm can save up to a textit{number of source tasks} factor in the source task sample complexity, compared with the naive uniform sampling from all source tasks. We also provide experiments on real-world computer vision datasets to illustrate the effectiveness of our proposed method on both linear and convolutional neural network representation classes. 
We believe our paper serves as an important initial step to bring techniques from active learning to representation learning.", "bibtex": "@InProceedings{pmlr-v162-chen22j,\n title = \t {Active Multi-Task Representation Learning},\n author = {Chen, Yifang and Jamieson, Kevin and Du, Simon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3271--3298},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22j/chen22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22j.html},\n abstract = \t {To leverage the power of big data from source domains and overcome the scarcity of target domain samples, representation learning based on multi-task pretraining has become a standard approach in many applications. However, large-scale pretraining is often computationally expensive and not affordable for small organizations. When there is only one target task, most source tasks can be irrelevant, and we can actively sample a subset of source data from the most To leverage the power of big data from source tasks and overcome the scarcity of the target task samples, representation learning based on multi-task pretraining has become a standard approach in many applications. However, up until now, choosing which source tasks to include in the multi-task learning has been more art than science. In this paper, we give the first formal study on resource task sampling by leveraging the techniques from active learning. We propose an algorithm that iteratively estimates the relevance of each source task to the target task and samples from each source task based on the estimated relevance. Theoretically, we show that for the linear representation class, to achieve the same error rate, our algorithm can save up to a textit{number of source tasks} factor in the source task sample complexity, compared with the naive uniform sampling from all source tasks. We also provide experiments on real-world computer vision datasets to illustrate the effectiveness of our proposed method on both linear and convolutional neural network representation classes. We believe our paper serves as an important initial step to bring techniques from active learning to representation learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22j/chen22j.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22j-supp.zip", "pdf_size": 1036026, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10334957847005481663&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Paul G. Allen School of Computer Science & Engineering, University of Washington; Paul G. Allen School of Computer Science & Engineering, University of Washington; Paul G. Allen School of Computer Science & Engineering, University of Washington", "aff_domain": "cs.washington.edu;cs.washington.edu;cs.washington.edu", "email": "cs.washington.edu;cs.washington.edu;cs.washington.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22j.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Paul G. 
Allen School of Computer Science & Engineering", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Active Nearest Neighbor Regression Through Delaunay Refinement", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16973", "id": "16973", "proceeding": "https://proceedings.mlr.press/v162/kravberg22a.html", "poster": "/media/PosterPDFs/ICML%202022/6dd3e6a48b7117f6ae04a6664beb740b_JVOJqRP.png?t=1656490248.7568548", "slides": "", "author_site": "Alexander Kravberg, Giovanni Luca Marchetti, Vladislav Polianskii, Anastasiia Varava, Florian T. Pokorny, Danica Kragic", "author": "Alexander Kravberg; Giovanni Luca Marchetti; Vladislav Polianskii; Anastasiia Varava; Florian T. Pokorny; Danica Kragic", "abstract": "We introduce an algorithm for active function approximation based on nearest neighbor regression. Our Active Nearest Neighbor Regressor (ANNR) relies on the Voronoi-Delaunay framework from computational geometry to subdivide the space into cells with constant estimated function value and select novel query points in a way that takes the geometry of the function graph into account. We consider the recent state-of-the-art active function approximator called DEFER, which is based on incremental rectangular partitioning of the space, as the main baseline. The ANNR addresses a number of limitations that arise from the space subdivision strategy used in DEFER. We provide a computationally efficient implementation of our method, as well as theoretical halting guarantees. Empirical results show that ANNR outperforms the baseline for both closed-form functions and real-world examples, such as gravitational wave parameter inference and exploration of the latent space of a generative model.", "bibtex": "@InProceedings{pmlr-v162-kravberg22a,\n title = \t {Active Nearest Neighbor Regression Through Delaunay Refinement},\n author = {Kravberg, Alexander and Marchetti, Giovanni Luca and Polianskii, Vladislav and Varava, Anastasiia and Pokorny, Florian T. and Kragic, Danica},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11650--11664},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kravberg22a/kravberg22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kravberg22a.html},\n abstract = \t {We introduce an algorithm for active function approximation based on nearest neighbor regression. Our Active Nearest Neighbor Regressor (ANNR) relies on the Voronoi-Delaunay framework from computational geometry to subdivide the space into cells with constant estimated function value and select novel query points in a way that takes the geometry of the function graph into account. We consider the recent state-of-the-art active function approximator called DEFER, which is based on incremental rectangular partitioning of the space, as the main baseline. The ANNR addresses a number of limitations that arise from the space subdivision strategy used in DEFER. We provide a computationally efficient implementation of our method, as well as theoretical halting guarantees. 
Empirical results show that ANNR outperforms the baseline for both closed-form functions and real-world examples, such as gravitational wave parameter inference and exploration of the latent space of a generative model.}\n}", "pdf": "https://proceedings.mlr.press/v162/kravberg22a/kravberg22a.pdf", "supp": "", "pdf_size": 6435564, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7767457385905286857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden; School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden; School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden; ; School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden; School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden", "aff_domain": "kth.se; ; ; ; ; ", "email": "kth.se; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/kravberg22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Royal Institute of Technology (KTH)", "aff_unique_dep": "School of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Sweden" }, { "title": "Active Sampling for Min-Max Fairness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18331", "id": "18331", "proceeding": "https://proceedings.mlr.press/v162/abernethy22a.html", "poster": "/media/PosterPDFs/ICML%202022/8726bb30dc7ce15023daa8ff8402bcfd.png?t=1657736391.0037622", "slides": "", "author_site": "Jacob Abernethy, Pranjal Awasthi, Matth\u00e4us Kleindessner, Jamie Morgenstern, Chris Russell, Jie Zhang", "author": "Jacob D Abernethy; Pranjal Awasthi; Matth\u00e4us Kleindessner; Jamie Morgenstern; Chris Russell; Jie Zhang", "abstract": "We propose simple active sampling and reweighting strategies for optimizing min-max fairness that can be applied to any classification or regression model learned via loss minimization. The key intuition behind our approach is to use at each timestep a datapoint from the group that is worst off under the current model for updating the model. The ease of implementation and the generality of our robust formulation make it an attractive option for improving model performance on disadvantaged groups. 
For convex learning problems, such as linear or logistic regression, we provide a fine-grained analysis, proving the rate of convergence to a min-max fair solution.", "bibtex": "@InProceedings{pmlr-v162-abernethy22a,\n title = \t {Active Sampling for Min-Max Fairness},\n author = {Abernethy, Jacob D and Awasthi, Pranjal and Kleindessner, Matth{\\\"a}us and Morgenstern, Jamie and Russell, Chris and Zhang, Jie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {53--65},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/abernethy22a/abernethy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/abernethy22a.html},\n abstract = \t {We propose simple active sampling and reweighting strategies for optimizing min-max fairness that can be applied to any classification or regression model learned via loss minimization. The key intuition behind our approach is to use at each timestep a datapoint from the group that is worst off under the current model for updating the model. The ease of implementation and the generality of our robust formulation make it an attractive option for improving model performance on disadvantaged groups. For convex learning problems, such as linear or logistic regression, we provide a fine-grained analysis, proving the rate of convergence to a min-max fair solution.}\n}", "pdf": "https://proceedings.mlr.press/v162/abernethy22a/abernethy22a.pdf", "supp": "", "pdf_size": 9489641, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7250212054919979465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Georgia Tech, USA; Google, USA; Amazon Web Services, Germany; University of Washington, USA; Amazon Web Services, Germany; University of Washington, USA", "aff_domain": "gatech.edu;google.com;amazon.de;cs.washington.edu;amazon.de;uw.edu", "email": "gatech.edu;google.com;amazon.de;cs.washington.edu;amazon.de;uw.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/abernethy22a.html", "aff_unique_index": "0;1;2;3;2;3", "aff_unique_norm": "Georgia Institute of Technology;Google;Amazon;University of Washington", "aff_unique_dep": ";Google;Amazon Web Services;", "aff_unique_url": "https://www.gatech.edu;https://www.google.com;https://aws.amazon.com/de;https://www.washington.edu", "aff_unique_abbr": "Georgia Tech;Google;AWS;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;1;0", "aff_country_unique": "United States;Germany" }, { "title": "Active fairness auditing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17203", "id": "17203", "proceeding": "https://proceedings.mlr.press/v162/yan22c.html", "poster": "/media/PosterPDFs/ICML%202022/768e78024aa8fdb9b8fe87be86f64745_OYuuObU.png?t=1658323371.1456988", "slides": "/media/icml-2022/Slides/17203.pdf", "author_site": "Tom Yan, Chicheng Zhang", "author": "Tom Yan; Chicheng Zhang", "abstract": "The fast spreading adoption of machine learning (ML) by companies across industries poses significant regulatory challenges. 
One such challenge is scalability: how can regulatory bodies efficiently", "bibtex": "@InProceedings{pmlr-v162-yan22c,\n title = \t {Active fairness auditing},\n author = {Yan, Tom and Zhang, Chicheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24929--24962},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yan22c/yan22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/yan22c.html},\n abstract = \t {The fast spreading adoption of machine learning (ML) by companies across industries poses significant regulatory challenges. One such challenge is scalability: how can regulatory bodies efficiently", "pdf": "https://proceedings.mlr.press/v162/yan22c/yan22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yan22c-supp.zip", "pdf_size": 705333, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4910061499731303388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Carnegie Mellon University; University of Arizona", "aff_domain": "andrew.cmu.edu;cs.arizona.edu", "email": "andrew.cmu.edu;cs.arizona.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yan22c.html", "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;University of Arizona", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.arizona.edu", "aff_unique_abbr": "CMU;UA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "ActiveHedge: Hedge meets Active Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16965", "id": "16965", "proceeding": "https://proceedings.mlr.press/v162/kumar22a.html", "poster": "/media/PosterPDFs/ICML%202022/e0854e3c03ec877be65d351b90680d46.png?t=1658012937.4081707", "slides": "", "author_site": "Bhuvesh Kumar, Jacob Abernethy, Venkatesh Saligrama", "author": "Bhuvesh Kumar; Jacob D Abernethy; Venkatesh Saligrama", "abstract": "We consider the classical problem of multi-class prediction with expert advice, but with an active learning twist. In this new setting the learner will only query the labels of a small number of examples, but still aims to minimize regret to the best expert as usual; the learner is also allowed a very short \"burn-in\" phase where it can fast-forward and query certain highly-informative examples. We design an algorithm that utilizes Hedge (aka Exponential Weights) as a subroutine, and we show that under a very particular combinatorial constraint on the matrix of expert predictions we can obtain a very strong regret guarantee while querying very few labels. This constraint, which we refer to as $\\zeta$-compactness, or just compactness, can be viewed as a non-stochastic variant of the disagreement coefficient, another popular parameter used to reason about the sample complexity of active learning in the IID setting. 
We also give a polynomial-time algorithm to calculate the $\\zeta$-compactness of a matrix up to an approximation factor of 3.", "bibtex": "@InProceedings{pmlr-v162-kumar22a,\n title = \t {{A}ctive{H}edge: Hedge meets Active Learning},\n author = {Kumar, Bhuvesh and Abernethy, Jacob D and Saligrama, Venkatesh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11694--11709},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kumar22a/kumar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kumar22a.html},\n abstract = \t {We consider the classical problem of multi-class prediction with expert advice, but with an active learning twist. In this new setting the learner will only query the labels of a small number of examples, but still aims to minimize regret to the best expert as usual; the learner is also allowed a very short \"burn-in\" phase where it can fast-forward and query certain highly-informative examples. We design an algorithm that utilizes Hedge (aka Exponential Weights) as a subroutine, and we show that under a very particular combinatorial constraint on the matrix of expert predictions we can obtain a very strong regret guarantee while querying very few labels. This constraint, which we refer to as $\\zeta$-compactness, or just compactness, can be viewed as a non-stochastic variant of the disagreement coefficient, another popular parameter used to reason about the sample complexity of active learning in the IID setting. 
We also give a polynomial-time algorithm to calculate the $\\zeta$-compactness of a matrix up to an approximation factor of 3.}\n}", "pdf": "https://proceedings.mlr.press/v162/kumar22a/kumar22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kumar22a-supp.zip", "pdf_size": 466095, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10886347869406721956&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Georgia Institute of Technology; Georgia Institute of Technology; Department of Electrical and Computer Engineering, Boston University", "aff_domain": "gatech.edu; ; ", "email": "gatech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kumar22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;Boston University", "aff_unique_dep": ";Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.gatech.edu;https://www.bu.edu", "aff_unique_abbr": "Georgia Tech;BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Actor-Critic based Improper Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17601", "id": "17601", "proceeding": "https://proceedings.mlr.press/v162/zaki22a.html", "poster": "/media/PosterPDFs/ICML%202022/64a7157cf3932bf74755aa3cf586f2ec_fvLzqo5.png?t=1657869464.1351042", "slides": "", "author_site": "Mohammadi Zaki, Avi Mohan, Aditya Gopalan, Shie Mannor", "author": "Mohammadi Zaki; Avi Mohan; Aditya Gopalan; Shie Mannor", "abstract": "We consider an improper reinforcement learning setting where a learner is given $M$ base controllers for an unknown Markov decision process, and wishes to combine them optimally to produce a potentially new controller that can outperform each of the base ones. This can be useful in tuning across controllers, learnt possibly in mismatched or simulated environments, to obtain a good controller for a given target environment with relatively few trials. Towards this, we propose two algorithms: (1) a Policy Gradient-based approach; and (2) an algorithm that can switch between a simple Actor-Critic (AC) based scheme and a Natural Actor-Critic (NAC) scheme depending on the available information. Both algorithms operate over a class of improper mixtures of the given controllers. For the first case, we derive convergence rate guarantees assuming access to a gradient oracle. For the AC-based approach we provide convergence rate guarantees to a stationary point in the basic AC case and to a global optimum in the NAC case. 
Numerical results on (i) the standard control theoretic benchmark of stabilizing an inverted pendulum; and (ii) a constrained queueing task show that our improper policy optimization algorithm can stabilize the system even when the base policies at its disposal are unstable.", "bibtex": "@InProceedings{pmlr-v162-zaki22a,\n title = \t {Actor-Critic based Improper Reinforcement Learning},\n author = {Zaki, Mohammadi and Mohan, Avi and Gopalan, Aditya and Mannor, Shie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25867--25919},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zaki22a/zaki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zaki22a.html},\n abstract = \t {We consider an improper reinforcement learning setting where a learner is given $M$ base controllers for an unknown Markov decision process, and wishes to combine them optimally to produce a potentially new controller that can outperform each of the base ones. This can be useful in tuning across controllers, learnt possibly in mismatched or simulated environments, to obtain a good controller for a given target environment with relatively few trials. Towards this, we propose two algorithms: (1) a Policy Gradient-based approach; and (2) an algorithm that can switch between a simple Actor-Critic (AC) based scheme and a Natural Actor-Critic (NAC) scheme depending on the available information. Both algorithms operate over a class of improper mixtures of the given controllers. For the first case, we derive convergence rate guarantees assuming access to a gradient oracle. For the AC-based approach we provide convergence rate guarantees to a stationary point in the basic AC case and to a global optimum in the NAC case. 
Numerical results on (i) the standard control theoretic benchmark of stabilizing an inverted pendulum; and (ii) a constrained queueing task show that our improper policy optimization algorithm can stabilize the system even when the base policies at its disposal are unstable.}\n}", "pdf": "https://proceedings.mlr.press/v162/zaki22a/zaki22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zaki22a-supp.zip", "pdf_size": 1088726, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3035641118035383996&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of ECE, IISc, Bangalore, India; Boston University, Massachusetts, USA; Department of ECE, IISc, Bangalore, India; Faculty of Electrical Engineering, Technion, Haifa, Israel + NVIDIA Research, Israel", "aff_domain": "iisc.ac.in; ; ; ", "email": "iisc.ac.in; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zaki22a.html", "aff_unique_index": "0;1;0;2+3", "aff_unique_norm": "Indian Institute of Science;Boston University;Technion;NVIDIA", "aff_unique_dep": "Department of Electrical Communication Engineering;;Faculty of Electrical Engineering;Research", "aff_unique_url": "https://www.iisc.ac.in;https://www.bu.edu;https://www.technion.ac.il;https://research.nvidia.com", "aff_unique_abbr": "IISc;BU;Technion;NVIDIA", "aff_campus_unique_index": "0;1;0;2", "aff_campus_unique": "Bangalore;Massachusetts;Haifa;", "aff_country_unique_index": "0;1;0;2+2", "aff_country_unique": "India;United States;Israel" }, { "title": "AdAUC: End-to-end Adversarial AUC Optimization Against Long-tail Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17207", "id": "17207", "proceeding": "https://proceedings.mlr.press/v162/hou22b.html", "poster": "", "slides": "", "author_site": "Wenzheng Hou, Qianqian Xu, zhiyong yang, Shilong Bao, Yuan He, Qingming Huang", "author": "Wenzheng Hou; Qianqian Xu; Zhiyong Yang; Shilong Bao; Yuan He; Qingming Huang", "abstract": "It is well-known that deep learning models are vulnerable to adversarial examples. Existing studies of adversarial training have made great progress against this challenge. As a typical trait, they often assume that the class distribution is overall balanced. However, long-tail datasets are ubiquitous in a wide spectrum of applications, where the amount of head class instances is significantly larger than the tail classes. Under such a scenario, AUC is a much more reasonable metric than accuracy since it is insensitive toward class distribution. Motivated by this, we present an early trial to explore adversarial training methods to optimize AUC. The main challenge lies in that the positive and negative examples are tightly coupled in the objective function. As a direct result, one cannot generate adversarial examples without a full scan of the dataset. To address this issue, based on a concavity regularization scheme, we reformulate the AUC optimization problem as a saddle point problem, where the objective becomes an instance-wise function. This leads to an end-to-end training protocol. Furthermore, we provide a convergence guarantee of the proposed training algorithm. Our analysis differs from the existing studies since the algorithm is asked to generate adversarial examples by calculating the gradient of a min-max problem. 
Finally, the extensive experimental results show the performance and robustness of our algorithm in three long-tail datasets.", "bibtex": "@InProceedings{pmlr-v162-hou22b,\n title = \t {{A}d{AUC}: End-to-end Adversarial {AUC} Optimization Against Long-tail Problems},\n author = {Hou, Wenzheng and Xu, Qianqian and Yang, Zhiyong and Bao, Shilong and He, Yuan and Huang, Qingming},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8903--8925},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hou22b/hou22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/hou22b.html},\n abstract = \t {It is well-known that deep learning models are vulnerable to adversarial examples. Existing studies of adversarial training have made great progress against this challenge. As a typical trait, they often assume that the class distribution is overall balanced. However, long-tail datasets are ubiquitous in a wide spectrum of applications, where the amount of head class instances is significantly larger than the tail classes. Under such a scenario, AUC is a much more reasonable metric than accuracy since it is insensitive toward class distribution. Motivated by this, we present an early trial to explore adversarial training methods to optimize AUC. The main challenge lies in that the positive and negative examples are tightly coupled in the objective function. As a direct result, one cannot generate adversarial examples without a full scan of the dataset. To address this issue, based on a concavity regularization scheme, we reformulate the AUC optimization problem as a saddle point problem, where the objective becomes an instance-wise function. This leads to an end-to-end training protocol. Furthermore, we provide a convergence guarantee of the proposed training algorithm. Our analysis differs from the existing studies since the algorithm is asked to generate adversarial examples by calculating the gradient of a min-max problem. Finally, the extensive experimental results show the performance and robustness of our algorithm in three long-tail datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/hou22b/hou22b.pdf", "supp": "", "pdf_size": 4012371, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5564871801348267763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/hou22b.html" }, { "title": "AdaGrad Avoids Saddle Points", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17717", "id": "17717", "proceeding": "https://proceedings.mlr.press/v162/antonakopoulos22a.html", "poster": "/media/PosterPDFs/ICML%202022/7ae26cbe9586dea7d1f0fa372aa86811.png?t=1657793502.3783457", "slides": "", "author_site": "Kimon Antonakopoulos, Panayotis Mertikopoulos, Georgios Piliouras, Xiao Wang", "author": "Kimon Antonakopoulos; Panayotis Mertikopoulos; Georgios Piliouras; Xiao Wang", "abstract": "Adaptive first-order methods in optimization have widespread ML applications due to their ability to adapt to non-convex landscapes. 
However, their convergence guarantees are typically stated in terms of vanishing gradient norms, which leaves open the issue of converging to undesirable saddle points (or even local maxima). In this paper, we focus on the AdaGrad family of algorithms - from scalar to full-matrix preconditioning - and we examine the question of whether the method\u2019s trajectories avoid saddle points. A major challenge that arises here is that AdaGrad\u2019s step-size (or, more accurately, the method\u2019s preconditioner) evolves over time in a filtration-dependent way, i.e., as a function of all gradients observed in earlier iterations; as a result, avoidance results for methods with a constant or vanishing step-size do not apply. We resolve this challenge by combining a series of step-size stabilization arguments with a recursive representation of the AdaGrad preconditioner that allows us to employ center-stable techniques and ultimately show that the induced trajectories avoid saddle points from almost any initial condition.", "bibtex": "@InProceedings{pmlr-v162-antonakopoulos22a,\n title = \t {{A}da{G}rad Avoids Saddle Points},\n author = {Antonakopoulos, Kimon and Mertikopoulos, Panayotis and Piliouras, Georgios and Wang, Xiao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {731--771},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/antonakopoulos22a/antonakopoulos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/antonakopoulos22a.html},\n abstract = \t {Adaptive first-order methods in optimization have widespread ML applications due to their ability to adapt to non-convex landscapes. However, their convergence guarantees are typically stated in terms of vanishing gradient norms, which leaves open the issue of converging to undesirable saddle points (or even local maxima). In this paper, we focus on the AdaGrad family of algorithms - from scalar to full-matrix preconditioning - and we examine the question of whether the method\u2019s trajectories avoid saddle points. A major challenge that arises here is that AdaGrad\u2019s step-size (or, more accurately, the method\u2019s preconditioner) evolves over time in a filtration-dependent way, i.e., as a function of all gradients observed in earlier iterations; as a result, avoidance results for methods with a constant or vanishing step-size do not apply. We resolve this challenge by combining a series of step-size stabilization arguments with a recursive representation of the AdaGrad preconditioner that allows us to employ center-stable techniques and ultimately show that the induced trajectories avoid saddle points from almost any initial condition.}\n}", "pdf": "https://proceedings.mlr.press/v162/antonakopoulos22a/antonakopoulos22a.pdf", "supp": "", "pdf_size": 484068, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14291797889612910613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Laboratory for Information and Inference Systems, IEM, STI, EPFL, 1015 Lausanne, Switzerland + Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Univ. 
Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France + Criteo AI Lab; Singapore University of Technology and Design; Shanghai University of Finance and Economics", "aff_domain": "epfl.ch;univ-grenoble-alpes.fr;sutd.edu.sg;sufe.edu.cn", "email": "epfl.ch;univ-grenoble-alpes.fr;sutd.edu.sg;sufe.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/antonakopoulos22a.html", "aff_unique_index": "0+1;1+2;3;4", "aff_unique_norm": "EPFL;Universite Grenoble Alpes;Criteo;Singapore University of Technology and Design;Shanghai University of Finance and Economics", "aff_unique_dep": "Laboratory for Information and Inference Systems;;Criteo AI Lab;;", "aff_unique_url": "https://www.epfl.ch;https://www.univ-grenoble-alpes.fr;https://www.criteo.com;https://www.sutd.edu.sg;http://www.sufe.edu.cn", "aff_unique_abbr": "EPFL;UGA;Criteo;SUTD;SUFE", "aff_campus_unique_index": "0+1;1", "aff_campus_unique": "Lausanne;Grenoble;", "aff_country_unique_index": "0+1;1+1;2;3", "aff_country_unique": "Switzerland;France;Singapore;China" }, { "title": "Adapting k-means Algorithms for Outliers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15985", "id": "15985", "proceeding": "https://proceedings.mlr.press/v162/grunau22a.html", "poster": "", "slides": "", "author_site": "Christoph Grunau, Vaclav Rozhon", "author": "Christoph Grunau; V\u00e1clav Rozho\u0148", "abstract": "This paper shows how to adapt several simple and classical sampling-based algorithms for the k-means problem to the setting with outliers. Recently, Bhaskara et al. (NeurIPS 2019) showed how to adapt the classical k-means++ algorithm to the setting with outliers. However, their algorithm needs to output O(log(k)$\\cdot$z) outliers, where z is the number of true outliers, to match the O(log k)-approximation guarantee of k-means++. In this paper, we build on their ideas and show how to adapt several sequential and distributed k-means algorithms to the setting with outliers, but with substantially stronger theoretical guarantees: our algorithms output (1 + $\\epsilon$)z outliers while achieving an O(1/$\\epsilon$)-approximation to the objective function. In the sequential world, we achieve this by adapting a recent algorithm of Lattanzi and Sohler (ICML 2019). In the distributed setting, we adapt a simple algorithm of Guha et al. (IEEE Trans. Know. and Data Engineering 2003) and the popular k-means$\\|$ of Bahmani et al. (PVLDB2012). A theoretical application of our techniques is an algorithm with running time O(nk^2/z) that achieves an O(1)-approximation to the objective function while outputting O(z) outliers, assuming k << z << n. 
This is complemented with a matching lower bound of $\\Omega$(nk^2/z) for this problem in the oracle model.", "bibtex": "@InProceedings{pmlr-v162-grunau22a,\n title = \t {Adapting k-means Algorithms for Outliers},\n author = {Grunau, Christoph and Rozho{\\v{n}}, V{\\'a}clav},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7845--7886},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/grunau22a/grunau22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/grunau22a.html},\n abstract = \t {This paper shows how to adapt several simple and classical sampling-based algorithms for the k-means problem to the setting with outliers. Recently, Bhaskara et al. (NeurIPS 2019) showed how to adapt the classical k-means++ algorithm to the setting with outliers. However, their algorithm needs to output O(log(k)$\\cdot$z) outliers, where z is the number of true outliers, to match the O(log k)-approximation guarantee of k-means++. In this paper, we build on their ideas and show how to adapt several sequential and distributed k-means algorithms to the setting with outliers, but with substantially stronger theoretical guarantees: our algorithms output (1 + $\\epsilon$)z outliers while achieving an O(1/$\\epsilon$)-approximation to the objective function. In the sequential world, we achieve this by adapting a recent algorithm of Lattanzi and Sohler (ICML 2019). In the distributed setting, we adapt a simple algorithm of Guha et al. (IEEE Trans. Know. and Data Engineering 2003) and the popular k-means$\\|$ of Bahmani et al. (PVLDB2012). A theoretical application of our techniques is an algorithm with running time O(nk^2/z) that achieves an O(1)-approximation to the objective function while outputting O(z) outliers, assuming k << z << n. 
This is complemented with a matching lower bound of $\\Omega$(nk^2/z) for this problem in the oracle model.}\n}", "pdf": "https://proceedings.mlr.press/v162/grunau22a/grunau22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/grunau22a-supp.zip", "pdf_size": 804539, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17674053106923968450&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "ETH Z\u00fcrich; ETH Z\u00fcrich", "aff_domain": "ethz.ch;ethz.ch", "email": "ethz.ch;ethz.ch", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/grunau22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Z\u00fcrich", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Adapting the Linearised Laplace Model Evidence for Modern Deep Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18289", "id": "18289", "proceeding": "https://proceedings.mlr.press/v162/antoran22a.html", "poster": "/media/PosterPDFs/ICML%202022/de58bfe3d33dada41a9398c30e21eeed.png?t=1658065451.0177236", "slides": "", "author_site": "Javier Antor\u00e1n, David Janz, James Allingham, Erik Daxberger, Riccardo Barbano, Eric Nalisnick, Jose Miguel Hernandez-Lobato", "author": "Javier Antoran; David Janz; James U Allingham; Erik Daxberger; Riccardo Rb Barbano; Eric Nalisnick; Jose Miguel Hernandez-Lobato", "abstract": "The linearised Laplace method for estimating model uncertainty has received renewed attention in the Bayesian deep learning community. The method provides reliable error bars and admits a closed-form expression for the model evidence, allowing for scalable selection of model hyperparameters. In this work, we examine the assumptions behind this method, particularly in conjunction with model selection. We show that these interact poorly with some now-standard tools of deep learning\u2013stochastic approximation methods and normalisation layers\u2013and make recommendations for how to better adapt this classic method to the modern setting. We provide theoretical support for our recommendations and validate them empirically on MLPs, classic CNNs, residual networks with and without normalisation layers, generative autoencoders and transformers.", "bibtex": "@InProceedings{pmlr-v162-antoran22a,\n title = \t {Adapting the Linearised {L}aplace Model Evidence for Modern Deep Learning},\n author = {Antoran, Javier and Janz, David and Allingham, James U and Daxberger, Erik and Barbano, Riccardo Rb and Nalisnick, Eric and Hernandez-Lobato, Jose Miguel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {796--821},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/antoran22a/antoran22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/antoran22a.html},\n abstract = \t {The linearised Laplace method for estimating model uncertainty has received renewed attention in the Bayesian deep learning community. 
The method provides reliable error bars and admits a closed-form expression for the model evidence, allowing for scalable selection of model hyperparameters. In this work, we examine the assumptions behind this method, particularly in conjunction with model selection. We show that these interact poorly with some now-standard tools of deep learning\u2013stochastic approximation methods and normalisation layers\u2013and make recommendations for how to better adapt this classic method to the modern setting. We provide theoretical support for our recommendations and validate them empirically on MLPs, classic CNNs, residual networks with and without normalisation layers, generative autoencoders and transformers.}\n}", "pdf": "https://proceedings.mlr.press/v162/antoran22a/antoran22a.pdf", "supp": "", "pdf_size": 2772228, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3588379385157935270&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/antoran22a.html" }, { "title": "Adapting to Mixing Time in Stochastic Optimization with Markovian Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16231", "id": "16231", "proceeding": "https://proceedings.mlr.press/v162/dorfman22a.html", "poster": "/media/PosterPDFs/ICML%202022/11108a3dbfe4636cb40b84b803b2fff6.png?t=1657255503.6389086", "slides": "", "author_site": "Ron Dorfman, Kfir Levy", "author": "Ron Dorfman; Kfir Yehuda Levy", "abstract": "We consider stochastic optimization problems where data is drawn from a Markov chain. Existing methods for this setting crucially rely on knowing the mixing time of the chain, which in real-world applications is usually unknown. We propose the first optimization method that does not require the knowledge of the mixing time, yet obtains the optimal asymptotic convergence rate when applied to convex problems. We further show that our approach can be extended to: (i) finding stationary points in non-convex optimization with Markovian data, and (ii) obtaining better dependence on the mixing time in temporal difference (TD) learning; in both cases, our method is completely oblivious to the mixing time. Our method relies on a novel combination of multi-level Monte Carlo (MLMC) gradient estimation together with an adaptive learning method.", "bibtex": "@InProceedings{pmlr-v162-dorfman22a,\n title = \t {Adapting to Mixing Time in Stochastic Optimization with {M}arkovian Data},\n author = {Dorfman, Ron and Levy, Kfir Yehuda},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5429--5446},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dorfman22a/dorfman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dorfman22a.html},\n abstract = \t {We consider stochastic optimization problems where data is drawn from a Markov chain. Existing methods for this setting crucially rely on knowing the mixing time of the chain, which in real-world applications is usually unknown. 
We propose the first optimization method that does not require the knowledge of the mixing time, yet obtains the optimal asymptotic convergence rate when applied to convex problems. We further show that our approach can be extended to: (i) finding stationary points in non-convex optimization with Markovian data, and (ii) obtaining better dependence on the mixing time in temporal difference (TD) learning; in both cases, our method is completely oblivious to the mixing time. Our method relies on a novel combination of multi-level Monte Carlo (MLMC) gradient estimation together with an adaptive learning method.}\n}", "pdf": "https://proceedings.mlr.press/v162/dorfman22a/dorfman22a.pdf", "supp": "", "pdf_size": 1243569, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4133641935390571413&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Viterbi Faculty of Electrical and Computer Engineering, Technion, Haifa, Israel; Viterbi Faculty of Electrical and Computer Engineering, Technion, Haifa, Israel + A Viterbi Fellow", "aff_domain": "campus.technion.ac.il; ", "email": "campus.technion.ac.il; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/dorfman22a.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "Technion;University of Southern California", "aff_unique_dep": "Viterbi Faculty of Electrical and Computer Engineering;Viterbi School of Engineering", "aff_unique_url": "https://www.technion.ac.il;https://viterbi.usc.edu", "aff_unique_abbr": "Technion;USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Haifa;", "aff_country_unique_index": "0;0+1", "aff_country_unique": "Israel;United States" }, { "title": "Adaptive Accelerated (Extra-)Gradient Methods with Variance Reduction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16531", "id": "16531", "proceeding": "https://proceedings.mlr.press/v162/liu22o.html", "poster": "/media/PosterPDFs/ICML%202022/21f4c3b5591da245af90a2fd52fa1a55_ffGCLrj.png?t=1657432114.400814", "slides": "", "author_site": "Zijian Liu, Ta Duy Nguyen, Alina Ene, Huy Nguyen", "author": "Zijian Liu; Ta Duy Nguyen; Alina Ene; Huy Nguyen", "abstract": "In this paper, we study the finite-sum convex optimization problem focusing on the general convex case. Recently, the study of variance reduced (VR) methods and their accelerated variants has made exciting progress. However, the step size used in the existing VR algorithms typically depends on the smoothness parameter, which is often unknown and requires tuning in practice. To address this problem, we propose two novel adaptive VR algorithms:", "bibtex": "@InProceedings{pmlr-v162-liu22o,\n title = \t {Adaptive Accelerated ({E}xtra-){G}radient Methods with Variance Reduction},\n author = {Liu, Zijian and Nguyen, Ta Duy and Ene, Alina and Nguyen, Huy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13947--13994},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22o/liu22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22o.html},\n abstract = \t {In this paper, we study the finite-sum convex optimization problem focusing on the general convex case. 
Recently, the study of variance reduced (VR) methods and their accelerated variants has made exciting progress. However, the step size used in the existing VR algorithms typically depends on the smoothness parameter, which is often unknown and requires tuning in practice. To address this problem, we propose two novel adaptive VR algorithms:", "pdf": "https://proceedings.mlr.press/v162/liu22o/liu22o.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liu22o-supp.zip", "pdf_size": 1006693, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6692053373814692949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Boston University; Department of Computer Science, Boston University; Department of Computer Science, Boston University; Khoury College of Computer and Information Science, Northeastern University", "aff_domain": "bu.edu; ; ; ", "email": "bu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22o.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Boston University;Northeastern University", "aff_unique_dep": "Department of Computer Science;Khoury College of Computer and Information Science", "aff_unique_url": "https://www.bu.edu;https://www.northeastern.edu", "aff_unique_abbr": "BU;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adaptive Best-of-Both-Worlds Algorithm for Heavy-Tailed Multi-Armed Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16609", "id": "16609", "proceeding": "https://proceedings.mlr.press/v162/huang22c.html", "poster": "", "slides": "", "author_site": "Jiatai Huang, Yan Dai, Longbo Huang", "author": "Jiatai Huang; Yan Dai; Longbo Huang", "abstract": "In this paper, we generalize the concept of heavy-tailed multi-armed bandits to adversarial environments, and develop robust best-of-both-worlds algorithms for heavy-tailed multi-armed bandits (MAB), where losses have $\\alpha$-th ($1<\\alpha\\le 2$) moments bounded by $\\sigma^\\alpha$, while the variances may not exist. Specifically, we design an algorithm \\texttt{HTINF}, when the heavy-tail parameters $\\alpha$ and $\\sigma$ are known to the agent, \\texttt{HTINF} simultaneously achieves the optimal regret for both stochastic and adversarial environments, without knowing the actual environment type a-priori. When $\\alpha,\\sigma$ are unknown, \\texttt{HTINF} achieves a $\\log T$-style instance-dependent regret in stochastic cases and $o(T)$ no-regret guarantee in adversarial cases. We further develop an algorithm \\texttt{AdaTINF}, achieving $\\mathcal O(\\sigma K^{1-\\nicefrac 1\\alpha}T^{\\nicefrac{1}{\\alpha}})$ minimax optimal regret even in adversarial settings, without prior knowledge on $\\alpha$ and $\\sigma$. This result matches the known regret lower-bound (Bubeck et al., 2013), which assumed a stochastic environment and $\\alpha$ and $\\sigma$ are both known. 
To our knowledge, the proposed \\texttt{HTINF} algorithm is the first to enjoy a best-of-both-worlds regret guarantee, and \\texttt{AdaTINF} is the first algorithm that can adapt to both $\\alpha$ and $\\sigma$ to achieve optimal gap-independent regret bound in classical heavy-tailed stochastic MAB setting and our novel adversarial formulation.", "bibtex": "@InProceedings{pmlr-v162-huang22c,\n title = \t {Adaptive Best-of-Both-Worlds Algorithm for Heavy-Tailed Multi-Armed Bandits},\n author = {Huang, Jiatai and Dai, Yan and Huang, Longbo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9173--9200},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22c/huang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22c.html},\n abstract = \t {In this paper, we generalize the concept of heavy-tailed multi-armed bandits to adversarial environments, and develop robust best-of-both-worlds algorithms for heavy-tailed multi-armed bandits (MAB), where losses have $\\alpha$-th ($1<\\alpha\\le 2$) moments bounded by $\\sigma^\\alpha$, while the variances may not exist. Specifically, we design an algorithm \\texttt{HTINF}, when the heavy-tail parameters $\\alpha$ and $\\sigma$ are known to the agent, \\texttt{HTINF} simultaneously achieves the optimal regret for both stochastic and adversarial environments, without knowing the actual environment type a-priori. When $\\alpha,\\sigma$ are unknown, \\texttt{HTINF} achieves a $\\log T$-style instance-dependent regret in stochastic cases and $o(T)$ no-regret guarantee in adversarial cases. We further develop an algorithm \\texttt{AdaTINF}, achieving $\\mathcal O(\\sigma K^{1-\\nicefrac 1\\alpha}T^{\\nicefrac{1}{\\alpha}})$ minimax optimal regret even in adversarial settings, without prior knowledge on $\\alpha$ and $\\sigma$. This result matches the known regret lower-bound (Bubeck et al., 2013), which assumed a stochastic environment and $\\alpha$ and $\\sigma$ are both known. 
To our knowledge, the proposed \\texttt{HTINF} algorithm is the first to enjoy a best-of-both-worlds regret guarantee, and \\texttt{AdaTINF} is the first algorithm that can adapt to both $\\alpha$ and $\\sigma$ to achieve optimal gap-independent regret bound in classical heavy-tailed stochastic MAB setting and our novel adversarial formulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22c/huang22c.pdf", "supp": "", "pdf_size": 475874, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5283979312300389359&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/huang22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "Tsinghua", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Adaptive Conformal Predictions for Time Series", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17817", "id": "17817", "proceeding": "https://proceedings.mlr.press/v162/zaffran22a.html", "poster": "/media/PosterPDFs/ICML%202022/b59442085644532ef03417a3e5a76437_ShcNJIg.png?t=1657807820.8791652", "slides": "/media/icml-2022/Slides/17817_45KQty1.pdf", "author_site": "Margaux Zaffran, Olivier FERON, Yannig Goude, julie Josse, Aymeric Dieuleveut", "author": "Margaux Zaffran; Olivier Feron; Yannig Goude; Julie Josse; Aymeric Dieuleveut", "abstract": "Uncertainty quantification of predictive models is crucial in decision-making problems. Conformal prediction is a general and theoretically sound answer. However, it requires exchangeable data, excluding time series. While recent works tackled this issue, we argue that Adaptive Conformal Inference (ACI, Gibbs & Cand{\u00e8}s, 2021), developed for distribution-shift time series, is a good procedure for time series with general dependency. We theoretically analyse the impact of the learning rate on its efficiency in the exchangeable and auto-regressive case. We propose a parameter-free method, AgACI, that adaptively builds upon ACI based on online expert aggregation. We lead extensive fair simulations against competing methods that advocate for ACI\u2019s use in time series. We conduct a real case study: electricity price forecasting. The proposed aggregation algorithm provides efficient prediction intervals for day-ahead forecasting. 
All the code and data to reproduce the experiments are made available on GitHub.", "bibtex": "@InProceedings{pmlr-v162-zaffran22a,\n title = \t {Adaptive Conformal Predictions for Time Series},\n author = {Zaffran, Margaux and Feron, Olivier and Goude, Yannig and Josse, Julie and Dieuleveut, Aymeric},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25834--25866},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zaffran22a/zaffran22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zaffran22a.html},\n abstract = \t {Uncertainty quantification of predictive models is crucial in decision-making problems. Conformal prediction is a general and theoretically sound answer. However, it requires exchangeable data, excluding time series. While recent works tackled this issue, we argue that Adaptive Conformal Inference (ACI, Gibbs & Cand{\u00e8}s, 2021), developed for distribution-shift time series, is a good procedure for time series with general dependency. We theoretically analyse the impact of the learning rate on its efficiency in the exchangeable and auto-regressive case. We propose a parameter-free method, AgACI, that adaptively builds upon ACI based on online expert aggregation. We lead extensive fair simulations against competing methods that advocate for ACI\u2019s use in time series. We conduct a real case study: electricity price forecasting. The proposed aggregation algorithm provides efficient prediction intervals for day-ahead forecasting. 
All the code and data to reproduce the experiments are made available on GitHub.}\n}", "pdf": "https://proceedings.mlr.press/v162/zaffran22a/zaffran22a.pdf", "supp": "", "pdf_size": 1416052, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6242332424381793143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 19, "aff": "Electricit\u00e9 de France R&D, Palaiseau, France+INRIA Sophia-Antipolis, Montpellier, France+CMAP, Ecole Polytechnique, Institut Polytechnique de Paris, Palaiseau, France; Electricit\u00e9 de France R&D, Palaiseau, France+FiME, Universit\u00e9 Paris-Dauphine, France+LMO, Universit\u00e9 Paris-Saclay, Orsay, France; Electricit\u00e9 de France R&D, Palaiseau, France+LMO, Universit\u00e9 Paris-Saclay, Orsay, France; INRIA Sophia-Antipolis, Montpellier, France+IDESP, Montpellier, France; CMAP, Ecole Polytechnique, Institut Polytechnique de Paris, Palaiseau, France", "aff_domain": "inria.fr; ; ; ; ", "email": "inria.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zaffran22a.html", "aff_unique_index": "0+1+2;0+3+4;0+4;1+5;2", "aff_unique_norm": "Electricit\u00e9 de France;INRIA;Ecole Polytechnique;Universit\u00e9 Paris-Dauphine;Universit\u00e9 Paris-Saclay;Institut de Recherche pour le Developpement", "aff_unique_dep": "R&D;;CMAP;FiME;LMO;", "aff_unique_url": "https://www.edf.com;https://www.inria.fr;https://www.ec-polytechnique.fr;https://www.univ-paris-dauphine.fr;https://www.universite-paris-saclay.fr;https://www.ird.fr", "aff_unique_abbr": "EDF;INRIA;Polytechnique;;;IDESP", "aff_campus_unique_index": "0+1+0;0+3;0+3;1+4;0", "aff_campus_unique": "Palaiseau;Sophia-Antipolis;;Orsay;Montpellier", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0+0;0", "aff_country_unique": "France" }, { "title": "Adaptive Data Analysis with Correlated Observations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16033", "id": "16033", "proceeding": "https://proceedings.mlr.press/v162/kontorovich22a.html", "poster": "/media/PosterPDFs/ICML%202022/3bd4017318837e92a66298c7855f4427.png?t=1657707766.7645192", "slides": "", "author_site": "Aryeh Kontorovich, Menachem Sadigurschi, Uri Stemmer", "author": "Aryeh Kontorovich; Menachem Sadigurschi; Uri Stemmer", "abstract": "The vast majority of the work on adaptive data analysis focuses on the case where the samples in the dataset are independent. Several approaches and tools have been successfully applied in this context, such as", "bibtex": "@InProceedings{pmlr-v162-kontorovich22a,\n title = \t {Adaptive Data Analysis with Correlated Observations},\n author = {Kontorovich, Aryeh and Sadigurschi, Menachem and Stemmer, Uri},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11483--11498},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kontorovich22a/kontorovich22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kontorovich22a.html},\n abstract = \t {The vast majority of the work on adaptive data analysis focuses on the case where the samples in the dataset are independent. 
Several approaches and tools have been successfully applied in this context, such as", "pdf": "https://proceedings.mlr.press/v162/kontorovich22a/kontorovich22a.pdf", "supp": "", "pdf_size": 388325, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8994215029178811503&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Ben-Gurion University; Department of Computer Science, Ben-Gurion University; Blavatnik School of Computer Science, Tel Aviv University + Google Research", "aff_domain": "post.bgu.ac.il;post.bgu.ac.il;tau.ac.il", "email": "post.bgu.ac.il;post.bgu.ac.il;tau.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kontorovich22a.html", "aff_unique_index": "0;0;1+2", "aff_unique_norm": "Ben-Gurion University;Tel Aviv University;Google", "aff_unique_dep": "Department of Computer Science;Blavatnik School of Computer Science;Google Research", "aff_unique_url": "https://www.bgu.ac.il;https://www.tau.ac.il;https://research.google", "aff_unique_abbr": "BGU;TAU;Google Research", "aff_campus_unique_index": "1+2", "aff_campus_unique": ";Tel Aviv;Mountain View", "aff_country_unique_index": "0;0;0+1", "aff_country_unique": "Israel;United States" }, { "title": "Adaptive Gaussian Process Change Point Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17217", "id": "17217", "proceeding": "https://proceedings.mlr.press/v162/caldarelli22a.html", "poster": "/media/PosterPDFs/ICML%202022/bf2fb7d1825a1df3ca308ad0bf48591e_OSf75R8.png?t=1657631149.3931034", "slides": "", "author_site": "Edoardo Caldarelli, Philippe Wenk, Stefan Bauer, Andreas Krause", "author": "Edoardo Caldarelli; Philippe Wenk; Stefan Bauer; Andreas Krause", "abstract": "Detecting change points in time series, i.e., points in time at which some observed process suddenly changes, is a fundamental task that arises in many real-world applications, with consequences for safety and reliability. In this work, we propose ADAGA, a novel Gaussian process-based solution to this problem, that leverages a powerful heuristics we developed based on statistical hypothesis testing. In contrast to prior approaches, ADAGA adapts to changes both in mean and covariance structure of the temporal process. In extensive experiments, we show its versatility and applicability to different classes of change points, demonstrating that it is significantly more accurate than current state-of-the-art alternatives.", "bibtex": "@InProceedings{pmlr-v162-caldarelli22a,\n title = \t {Adaptive {G}aussian Process Change Point Detection},\n author = {Caldarelli, Edoardo and Wenk, Philippe and Bauer, Stefan and Krause, Andreas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2542--2571},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/caldarelli22a/caldarelli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/caldarelli22a.html},\n abstract = \t {Detecting change points in time series, i.e., points in time at which some observed process suddenly changes, is a fundamental task that arises in many real-world applications, with consequences for safety and reliability. 
In this work, we propose ADAGA, a novel Gaussian process-based solution to this problem, that leverages a powerful heuristics we developed based on statistical hypothesis testing. In contrast to prior approaches, ADAGA adapts to changes both in mean and covariance structure of the temporal process. In extensive experiments, we show its versatility and applicability to different classes of change points, demonstrating that it is significantly more accurate than current state-of-the-art alternatives.}\n}", "pdf": "https://proceedings.mlr.press/v162/caldarelli22a/caldarelli22a.pdf", "supp": "", "pdf_size": 4500226, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7238806589069309136&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial, CSIC-UPC, Barcelona, Spain+ETH Zurich; Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Intelligent Systems, KTH, Stockholm, Sweden; Department of Computer Science, ETH Zurich, Zurich, Switzerland", "aff_domain": "iri.upc.edu; ; ; ", "email": "iri.upc.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/caldarelli22a.html", "aff_unique_index": "0+1;1;2;1", "aff_unique_norm": "Institut de Rob\u00f2tica i Inform\u00e0tica Industrial;ETH Zurich;KTH Royal Institute of Technology", "aff_unique_dep": "CSIC-UPC;;Department of Intelligent Systems", "aff_unique_url": "http://www.iri.upc.edu/;https://www.ethz.ch;https://www.kth.se", "aff_unique_abbr": "IRI;ETHZ;KTH", "aff_campus_unique_index": "0;2;3;2", "aff_campus_unique": "Barcelona;;Zurich;Stockholm", "aff_country_unique_index": "0+1;1;2;1", "aff_country_unique": "Spain;Switzerland;Sweden" }, { "title": "Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17065", "id": "17065", "proceeding": "https://proceedings.mlr.press/v162/xie22d.html", "poster": "/media/PosterPDFs/ICML%202022/217e342fc01668b10cb1188d40d3370e_NAMtL27.png?t=1656516238.8300333", "slides": "/media/icml-2022/Slides/17065.pdf", "author_site": "Zeke Xie, Xinrui Wang, Huishuai Zhang, Issei Sato, Masashi Sugiyama", "author": "Zeke Xie; Xinrui Wang; Huishuai Zhang; Issei Sato; Masashi Sugiyama", "abstract": "Adaptive Moment Estimation (Adam), which combines Adaptive Learning Rate and Momentum, would be the most popular stochastic optimizer for accelerating the training of deep neural networks. However, it is empirically known that Adam often generalizes worse than Stochastic Gradient Descent (SGD). The purpose of this paper is to unveil the mystery of this behavior in the diffusion theoretical framework. Specifically, we disentangle the effects of Adaptive Learning Rate and Momentum of the Adam dynamics on saddle-point escaping and flat minima selection. We prove that Adaptive Learning Rate can escape saddle points efficiently, but cannot select flat minima as SGD does. In contrast, Momentum provides a drift effect to help the training process pass through saddle points, and almost does not affect flat minima selection. This partly explains why SGD (with Momentum) generalizes better, while Adam generalizes worse but converges faster. Furthermore, motivated by the analysis, we design a novel adaptive optimization framework named Adaptive Inertia, which uses parameter-wise adaptive inertia to accelerate the training and provably favors flat minima as well as SGD. 
Our extensive experiments demonstrate that the proposed adaptive inertia method can generalize significantly better than SGD and conventional adaptive gradient methods.", "bibtex": "@InProceedings{pmlr-v162-xie22d,\n title = \t {Adaptive Inertia: Disentangling the Effects of Adaptive Learning Rate and Momentum},\n author = {Xie, Zeke and Wang, Xinrui and Zhang, Huishuai and Sato, Issei and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24430--24459},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xie22d/xie22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/xie22d.html},\n abstract = \t {Adaptive Moment Estimation (Adam), which combines Adaptive Learning Rate and Momentum, would be the most popular stochastic optimizer for accelerating the training of deep neural networks. However, it is empirically known that Adam often generalizes worse than Stochastic Gradient Descent (SGD). The purpose of this paper is to unveil the mystery of this behavior in the diffusion theoretical framework. Specifically, we disentangle the effects of Adaptive Learning Rate and Momentum of the Adam dynamics on saddle-point escaping and flat minima selection. We prove that Adaptive Learning Rate can escape saddle points efficiently, but cannot select flat minima as SGD does. In contrast, Momentum provides a drift effect to help the training process pass through saddle points, and almost does not affect flat minima selection. This partly explains why SGD (with Momentum) generalizes better, while Adam generalizes worse but converges faster. Furthermore, motivated by the analysis, we design a novel adaptive optimization framework named Adaptive Inertia, which uses parameter-wise adaptive inertia to accelerate the training and provably favors flat minima as well as SGD. 
Our extensive experiments demonstrate that the proposed adaptive inertia method can generalize significantly better than SGD and conventional adaptive gradient methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/xie22d/xie22d.pdf", "supp": "", "pdf_size": 2253276, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13927749267930732278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "The University of Tokyo; The University of Tokyo; Microsoft Research Asia; The University of Tokyo; RIKEN Center for AIP", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xie22d.html", "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of Tokyo;Microsoft;RIKEN", "aff_unique_dep": ";Research;Center for AIP", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.microsoft.com/en-us/research/group/asia;https://www.riken.jp", "aff_unique_abbr": "UTokyo;MSR Asia;RIKEN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "Japan;China" }, { "title": "Adaptive Model Design for Markov Decision Process", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18317", "id": "18317", "proceeding": "https://proceedings.mlr.press/v162/chen22ab.html", "poster": "/media/PosterPDFs/ICML%202022/dc9fa5f217a1e57b8a6adeb065560b38.png?t=1657794719.6299121", "slides": "", "author_site": "Siyu Chen, Donglin Yang, Jiayang Li, Senmiao Wang, Zhuoran Yang, Zhaoran Wang", "author": "Siyu Chen; Donglin Yang; Jiayang Li; Senmiao Wang; Zhuoran Yang; Zhaoran Wang", "abstract": "In a Markov decision process (MDP), an agent interacts with the environment via perceptions and actions. During this process, the agent aims to maximize its own gain. Hence, appropriate regulations are often required, if we hope to take the external costs/benefits of its actions into consideration. In this paper, we study how to regulate such an agent by redesigning model parameters that can affect the rewards and/or the transition kernels. We formulate this problem as a bilevel program, in which the lower-level MDP is regulated by the upper-level model designer. To solve the resulting problem, we develop a scheme that allows the designer to iteratively predict the agent\u2019s reaction by solving the MDP and then adaptively update model parameters based on the predicted reaction. The algorithm is first theoretically analyzed and then empirically tested on several MDP models arising in economics and robotics.", "bibtex": "@InProceedings{pmlr-v162-chen22ab,\n title = \t {Adaptive Model Design for {M}arkov Decision Process},\n author = {Chen, Siyu and Yang, Donglin and Li, Jiayang and Wang, Senmiao and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3679--3700},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ab/chen22ab.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ab.html},\n abstract = \t {In a Markov decision process (MDP), an agent interacts with the environment via perceptions and actions. 
During this process, the agent aims to maximize its own gain. Hence, appropriate regulations are often required, if we hope to take the external costs/benefits of its actions into consideration. In this paper, we study how to regulate such an agent by redesigning model parameters that can affect the rewards and/or the transition kernels. We formulate this problem as a bilevel program, in which the lower-level MDP is regulated by the upper-level model designer. To solve the resulting problem, we develop a scheme that allows the designer to iteratively predict the agent\u2019s reaction by solving the MDP and then adaptively update model parameters based on the predicted reaction. The algorithm is first theoretically analyzed and then empirically tested on several MDP models arising in economics and robotics.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ab/chen22ab.pdf", "supp": "", "pdf_size": 462520, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1836348513814518637&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Tsinghua University; Tsinghua University; Northwestern University; Northwestern University; Yale University; Northwestern University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;northwestern.edu;northwestern.edu;yale.edu;gmail.com", "email": "tsinghua.edu.cn;tsinghua.edu.cn;northwestern.edu;northwestern.edu;yale.edu;gmail.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22ab.html", "aff_unique_index": "0;0;1;1;2;1", "aff_unique_norm": "Tsinghua University;Northwestern University;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "THU;NU;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Adaptive Random Walk Gradient Descent for Decentralized Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16457", "id": "16457", "proceeding": "https://proceedings.mlr.press/v162/sun22b.html", "poster": "/media/PosterPDFs/ICML%202022/309fee4e541e51de2e41f21bebb342aa.png?t=1657176588.417864", "slides": "", "author_site": "Tao Sun, Dongsheng Li, Bao Wang", "author": "Tao Sun; Dongsheng Li; Bao Wang", "abstract": "In this paper, we study the adaptive step size random walk gradient descent with momentum for decentralized optimization, in which the training samples are drawn dependently with each other. We establish theoretical convergence rates of the adaptive step size random walk gradient descent with momentum for both convex and nonconvex settings. In particular, we prove that adaptive random walk algorithms perform as well as the non-adaptive method for dependent data in general cases but achieve acceleration when the stochastic gradients are \u201csparse\u201d. Moreover, we study the zeroth-order version of adaptive random walk gradient descent and provide corresponding convergence results. 
All assumptions used in this paper are mild and general, making our results applicable to many machine learning problems.", "bibtex": "@InProceedings{pmlr-v162-sun22b,\n title = \t {Adaptive Random Walk Gradient Descent for Decentralized Optimization},\n author = {Sun, Tao and Li, Dongsheng and Wang, Bao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20790--20809},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sun22b/sun22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/sun22b.html},\n abstract = \t {In this paper, we study the adaptive step size random walk gradient descent with momentum for decentralized optimization, in which the training samples are drawn dependently with each other. We establish theoretical convergence rates of the adaptive step size random walk gradient descent with momentum for both convex and nonconvex settings. In particular, we prove that adaptive random walk algorithms perform as well as the non-adaptive method for dependent data in general cases but achieve acceleration when the stochastic gradients are \u201csparse\u201d. Moreover, we study the zeroth-order version of adaptive random walk gradient descent and provide corresponding convergence results. All assumptions used in this paper are mild and general, making our results applicable to many machine learning problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/sun22b/sun22b.pdf", "supp": "", "pdf_size": 2281082, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17763475743826749137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sun22b.html" }, { "title": "Adaptive Second Order Coresets for Data-efficient Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16583", "id": "16583", "proceeding": "https://proceedings.mlr.press/v162/pooladzandi22a.html", "poster": "/media/PosterPDFs/ICML%202022/b4d6f2b565ca0eef1f9245403aac366a.png?t=1658014108.0316088", "slides": "", "author_site": "Omead Pooladzandi, David Davini, Baharan Mirzasoleiman", "author": "Omead Pooladzandi; David Davini; Baharan Mirzasoleiman", "abstract": "Training machine learning models on massive datasets incurs substantial computational costs. To alleviate such costs, there has been a sustained effort to develop data-efficient training methods that can carefully select subsets of the training examples that generalize on par with the full training data. However, existing methods are limited in providing theoretical guarantees for the quality of the models trained on the extracted subsets, and may perform poorly in practice. We propose AdaCore, a method that leverages the geometry of the data to extract subsets of the training examples for efficient machine learning. The key idea behind our method is to dynamically approximate the curvature of the loss function via an exponentially-averaged estimate of the Hessian to select weighted subsets (coresets) that provide a close approximation of the full gradient preconditioned with the Hessian. 
We prove rigorous guarantees for the convergence of various first and second-order methods applied to the subsets chosen by AdaCore. Our extensive experiments show that AdaCore extracts coresets with higher quality compared to baselines and speeds up training of convex and non-convex machine learning models, such as logistic regression and neural networks, by over 2.9x over the full data and 4.5x over random subsets.", "bibtex": "@InProceedings{pmlr-v162-pooladzandi22a,\n title = \t {Adaptive Second Order Coresets for Data-efficient Machine Learning},\n author = {Pooladzandi, Omead and Davini, David and Mirzasoleiman, Baharan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17848--17869},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pooladzandi22a/pooladzandi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pooladzandi22a.html},\n abstract = \t {Training machine learning models on massive datasets incurs substantial computational costs. To alleviate such costs, there has been a sustained effort to develop data-efficient training methods that can carefully select subsets of the training examples that generalize on par with the full training data. However, existing methods are limited in providing theoretical guarantees for the quality of the models trained on the extracted subsets, and may perform poorly in practice. We propose AdaCore, a method that leverages the geometry of the data to extract subsets of the training examples for efficient machine learning. The key idea behind our method is to dynamically approximate the curvature of the loss function via an exponentially-averaged estimate of the Hessian to select weighted subsets (coresets) that provide a close approximation of the full gradient preconditioned with the Hessian. We prove rigorous guarantees for the convergence of various first and second-order methods applied to the subsets chosen by AdaCore. 
Our extensive experiments show that AdaCore extracts coresets with higher quality compared to baselines and speeds up training of convex and non-convex machine learning models, such as logistic regression and neural networks, by over 2.9x over the full data and 4.5x over random subsets.}\n}", "pdf": "https://proceedings.mlr.press/v162/pooladzandi22a/pooladzandi22a.pdf", "supp": "", "pdf_size": 1519187, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=724710289135309295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical & Computer Engineering, University of California, Los Angeles, USA+Department of Computer Science, University of California, Los Angeles, USA; Department of Computer Science, University of California, Los Angeles, USA; Department of Computer Science, University of California, Los Angeles, USA", "aff_domain": "ucla.edu; ; ", "email": "ucla.edu; ; ", "github": "https://github.com/opooladz/AdaCore", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/pooladzandi22a.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Electrical & Computer Engineering", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Additive Gaussian Processes Revisited", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16273", "id": "16273", "proceeding": "https://proceedings.mlr.press/v162/lu22b.html", "poster": "/media/PosterPDFs/ICML%202022/f41ff84e7cbd129397c11f8c5d20c0f4_aWblIQJ.png?t=1657528605.7165487", "slides": "", "author_site": "Xiaoyu Lu, Alexis Boukouvalas, James Hensman", "author": "Xiaoyu Lu; Alexis Boukouvalas; James Hensman", "abstract": "Gaussian Process (GP) models are a class of flexible non-parametric models that have rich representational power. By using a Gaussian process with additive structure, complex responses can be modelled whilst retaining interpretability. Previous work showed that additive Gaussian process models require high-dimensional interaction terms. We propose the orthogonal additive kernel (OAK), which imposes an orthogonality constraint on the additive functions, enabling an identifiable, low-dimensional representation of the functional relationship. We connect the OAK kernel to functional ANOVA decomposition, and show improved convergence rates for sparse computation methods. 
With only a small number of additive low-dimensional terms, we demonstrate the OAK model achieves similar or better predictive performance compared to black-box models, while retaining interpretability.", "bibtex": "@InProceedings{pmlr-v162-lu22b,\n title = \t {Additive {G}aussian Processes Revisited},\n author = {Lu, Xiaoyu and Boukouvalas, Alexis and Hensman, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14358--14383},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22b/lu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22b.html},\n abstract = \t {Gaussian Process (GP) models are a class of flexible non-parametric models that have rich representational power. By using a Gaussian process with additive structure, complex responses can be modelled whilst retaining interpretability. Previous work showed that additive Gaussian process models require high-dimensional interaction terms. We propose the orthogonal additive kernel (OAK), which imposes an orthogonality constraint on the additive functions, enabling an identifiable, low-dimensional representation of the functional relationship. We connect the OAK kernel to functional ANOVA decomposition, and show improved convergence rates for sparse computation methods. With only a small number of additive low-dimensional terms, we demonstrate the OAK model achieves similar or better predictive performance compared to black-box models, while retaining interpretability.}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22b/lu22b.pdf", "supp": "", "pdf_size": 708077, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6171646250259596364&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Amazon, Cambridge, United Kingdom; Amazon, Cambridge, United Kingdom; Amazon, Cambridge, United Kingdom", "aff_domain": "amazon.com; ; ", "email": "amazon.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lu22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Addressing Optimism Bias in Sequence Modeling for Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16929", "id": "16929", "proceeding": "https://proceedings.mlr.press/v162/villaflor22a.html", "poster": "", "slides": "", "author_site": "Adam Villaflor, Zhe Huang, Swapnil Pande, John Dolan, Jeff Schneider", "author": "Adam R Villaflor; Zhe Huang; Swapnil Pande; John M Dolan; Jeff Schneider", "abstract": "Impressive results in natural language processing (NLP) based on the Transformer neural network architecture have inspired researchers to explore viewing offline reinforcement learning (RL) as a generic sequence modeling problem. Recent works based on this paradigm have achieved state-of-the-art results in several of the mostly deterministic offline Atari and D4RL benchmarks. 
However, because these methods jointly model the states and actions as a single sequencing problem, they struggle to disentangle the effects of the policy and world dynamics on the return. Thus, in adversarial or stochastic environments, these methods lead to overly optimistic behavior that can be dangerous in safety-critical systems like autonomous driving. In this work, we propose a method that addresses this optimism bias by explicitly disentangling the policy and world models, which allows us at test time to search for policies that are robust to multiple possible futures in the environment. We demonstrate our method\u2019s superior performance on a variety of autonomous driving tasks in simulation.", "bibtex": "@InProceedings{pmlr-v162-villaflor22a,\n title = \t {Addressing Optimism Bias in Sequence Modeling for Reinforcement Learning},\n author = {Villaflor, Adam R and Huang, Zhe and Pande, Swapnil and Dolan, John M and Schneider, Jeff},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22270--22283},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/villaflor22a/villaflor22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/villaflor22a.html},\n abstract = \t {Impressive results in natural language processing (NLP) based on the Transformer neural network architecture have inspired researchers to explore viewing offline reinforcement learning (RL) as a generic sequence modeling problem. Recent works based on this paradigm have achieved state-of-the-art results in several of the mostly deterministic offline Atari and D4RL benchmarks. However, because these methods jointly model the states and actions as a single sequencing problem, they struggle to disentangle the effects of the policy and world dynamics on the return. Thus, in adversarial or stochastic environments, these methods lead to overly optimistic behavior that can be dangerous in safety-critical systems like autonomous driving. In this work, we propose a method that addresses this optimism bias by explicitly disentangling the policy and world models, which allows us at test time to search for policies that are robust to multiple possible futures in the environment. 
We demonstrate our method\u2019s superior performance on a variety of autonomous driving tasks in simulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/villaflor22a/villaflor22a.pdf", "supp": "", "pdf_size": 937564, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=156431622636083247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "cmu.edu; ; ; ;cs.cmu.edu", "email": "cmu.edu; ; ; ;cs.cmu.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/villaflor22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Attack and Defense for Non-Parametric Two-Sample Tests", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17057", "id": "17057", "proceeding": "https://proceedings.mlr.press/v162/xu22m.html", "poster": "/media/PosterPDFs/ICML%202022/aba3b6fd5d186d28e06ff97135cade7f.png?t=1657264555.9033003", "slides": "", "author_site": "Xilie Xu, Jingfeng Zhang, Feng Liu, Masashi Sugiyama, Mohan Kankanhalli", "author": "Xilie Xu; Jingfeng Zhang; Feng Liu; Masashi Sugiyama; Mohan Kankanhalli", "abstract": "Non-parametric two-sample tests (TSTs) that judge whether two sets of samples are drawn from the same distribution, have been widely used in the analysis of critical data. People tend to employ TSTs as trusted basic tools and rarely have any doubt about their reliability. This paper systematically uncovers the failure mode of non-parametric TSTs through adversarial attacks and then proposes corresponding defense strategies. First, we theoretically show that an adversary can upper-bound the distributional shift which guarantees the attack\u2019s invisibility. Furthermore, we theoretically find that the adversary can also degrade the lower bound of a TST\u2019s test power, which enables us to iteratively minimize the test criterion in order to search for adversarial pairs. To enable TST-agnostic attacks, we propose an ensemble attack (EA) framework that jointly minimizes the different types of test criteria. Second, to robustify TSTs, we propose a max-min optimization that iteratively generates adversarial pairs to train the deep kernels. Extensive experiments on both simulated and real-world datasets validate the adversarial vulnerabilities of non-parametric TSTs and the effectiveness of our proposed defense. 
Source code is available at https://github.com/GodXuxilie/Robust-TST.git.", "bibtex": "@InProceedings{pmlr-v162-xu22m,\n title = \t {Adversarial Attack and Defense for Non-Parametric Two-Sample Tests},\n author = {Xu, Xilie and Zhang, Jingfeng and Liu, Feng and Sugiyama, Masashi and Kankanhalli, Mohan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24743--24769},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22m/xu22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22m.html},\n abstract = \t {Non-parametric two-sample tests (TSTs) that judge whether two sets of samples are drawn from the same distribution, have been widely used in the analysis of critical data. People tend to employ TSTs as trusted basic tools and rarely have any doubt about their reliability. This paper systematically uncovers the failure mode of non-parametric TSTs through adversarial attacks and then proposes corresponding defense strategies. First, we theoretically show that an adversary can upper-bound the distributional shift which guarantees the attack\u2019s invisibility. Furthermore, we theoretically find that the adversary can also degrade the lower bound of a TST\u2019s test power, which enables us to iteratively minimize the test criterion in order to search for adversarial pairs. To enable TST-agnostic attacks, we propose an ensemble attack (EA) framework that jointly minimizes the different types of test criteria. Second, to robustify TSTs, we propose a max-min optimization that iteratively generates adversarial pairs to train the deep kernels. Extensive experiments on both simulated and real-world datasets validate the adversarial vulnerabilities of non-parametric TSTs and the effectiveness of our proposed defense. 
Source code is available at https://github.com/GodXuxilie/Robust-TST.git.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22m/xu22m.pdf", "supp": "", "pdf_size": 2261976, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16006347209208499674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Computing, National University of Singapore; RIKEN Center for Advanced Intelligence Project (AIP); School of Mathematics and Statistics, The University of Melbourne; Graduate School of Frontier Sciences, The University of Tokyo; School of Computing, National University of Singapore", "aff_domain": "comp.nus.edu.sg;riken.jp;unimelb.edu.au;k.u-tokyo.ac.jp;comp.nus.edu.sg", "email": "comp.nus.edu.sg;riken.jp;unimelb.edu.au;k.u-tokyo.ac.jp;comp.nus.edu.sg", "github": "https://github.com/GodXuxilie/Robust-TST.git", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xu22m.html", "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "National University of Singapore;RIKEN;University of Melbourne;University of Tokyo", "aff_unique_dep": "School of Computing;Center for Advanced Intelligence Project (AIP);School of Mathematics and Statistics;Graduate School of Frontier Sciences", "aff_unique_url": "https://www.nus.edu.sg;https://www.riken.jp/en/;https://www.unimelb.edu.au;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NUS;RIKEN;UniMelb;UTokyo", "aff_campus_unique_index": "0;2;3;0", "aff_campus_unique": "Singapore;;Melbourne;Tokyo", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Singapore;Japan;Australia" }, { "title": "Adversarial Attacks on Gaussian Process Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16265", "id": "16265", "proceeding": "https://proceedings.mlr.press/v162/han22f.html", "poster": "/media/PosterPDFs/ICML%202022/c24cd76e1ce41366a4bbe8a49b02a028.png?t=1657008165.4227676", "slides": "", "author_site": "Eric Han, Jonathan Scarlett", "author": "Eric Han; Jonathan Scarlett", "abstract": "Gaussian processes (GP) are a widely-adopted tool used to sequentially optimize black-box functions, where evaluations are costly and potentially noisy. Recent works on GP bandits have proposed to move beyond random noise and devise algorithms robust to adversarial attacks. This paper studies this problem from the attacker\u2019s perspective, proposing various adversarial attack methods with differing assumptions on the attacker\u2019s strength and prior information. Our goal is to understand adversarial attacks on GP bandits from theoretical and practical perspectives. We focus primarily on targeted attacks on the popular GP-UCB algorithm and a related elimination-based algorithm, based on adversarially perturbing the function f to produce another function $\tilde{f}$ whose optima are in some target region. Based on our theoretical analysis, we devise both white-box attacks (known f) and black-box attacks (unknown f), with the former including a Subtraction attack and Clipping attack, and the latter including an Aggressive subtraction attack. 
We demonstrate that adversarial attacks on GP bandits can succeed in forcing the algorithm towards the target region even with a low attack budget, and we test our attacks\u2019 effectiveness on a diverse range of objective functions.", "bibtex": "@InProceedings{pmlr-v162-han22f,\n title = \t {Adversarial Attacks on {G}aussian Process Bandits},\n author = {Han, Eric and Scarlett, Jonathan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8304--8329},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22f/han22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22f.html},\n abstract = \t {Gaussian processes (GP) are a widely-adopted tool used to sequentially optimize black-box functions, where evaluations are costly and potentially noisy. Recent works on GP bandits have proposed to move beyond random noise and devise algorithms robust to adversarial attacks. This paper studies this problem from the attacker\u2019s perspective, proposing various adversarial attack methods with differing assumptions on the attacker\u2019s strength and prior information. Our goal is to understand adversarial attacks on GP bandits from theoretical and practical perspectives. We focus primarily on targeted attacks on the popular GP-UCB algorithm and a related elimination-based algorithm, based on adversarially perturbing the function f to produce another function $\tilde{f}$ whose optima are in some target region. Based on our theoretical analysis, we devise both white-box attacks (known f) and black-box attacks (unknown f), with the former including a Subtraction attack and Clipping attack, and the latter including an Aggressive subtraction attack. 
We demonstrate that adversarial attacks on GP bandits can succeed in forcing the algorithm towards the target region even with a low attack budget, and we test our attacks\u2019 effectiveness on a diverse range of objective functions.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22f/han22f.pdf", "supp": "", "pdf_size": 6072380, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13292319437654740768&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Computing, National University of Singapore; Department of Mathematics & Institute of Data Science, National University of Singapore", "aff_domain": "nus.edu.sg;comp.nus.edu.sg", "email": "nus.edu.sg;comp.nus.edu.sg", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/han22f.html", "aff_unique_index": "0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "Adversarial Masking for Self-Supervised Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16267", "id": "16267", "proceeding": "https://proceedings.mlr.press/v162/shi22d.html", "poster": "/media/PosterPDFs/ICML%202022/352fe25daf686bdb4edca223c921acea_x8BCLGq.png?t=1657621369.7728877", "slides": "", "author_site": "Yuge Shi, Siddharth N, Phil Torr, Adam Kosiorek", "author": "Yuge Shi; N Siddharth; Philip Torr; Adam R Kosiorek", "abstract": "We propose ADIOS, a masked image model (MIM) framework for self-supervised learning, which simultaneously learns a masking function and an image encoder using an adversarial objective. The image encoder is trained to minimise the distance between representations of the original and that of a masked image. The masking function, conversely, aims at maximising this distance. ADIOS consistently improves on state-of-the-art self-supervised learning (SSL) methods on a variety of tasks and datasets\u2014including classification on ImageNet100 and STL10, transfer learning on CIFAR10/100, Flowers102 and iNaturalist, as well as robustness evaluated on the backgrounds challenge (Xiao et al., 2021)\u2014while generating semantically meaningful masks. Unlike modern MIM models such as MAE, BEiT and iBOT, ADIOS does not rely on the image-patch tokenisation construction of Vision Transformers, and can be implemented with convolutional backbones. 
We further demonstrate that the masks learned by ADIOS are more effective in improving representation learning of SSL methods than masking schemes used in popular MIM models.", "bibtex": "@InProceedings{pmlr-v162-shi22d,\n title = \t {Adversarial Masking for Self-Supervised Learning},\n author = {Shi, Yuge and Siddharth, N and Torr, Philip and Kosiorek, Adam R},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20026--20040},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22d/shi22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22d.html},\n abstract = \t {We propose ADIOS, a masked image model (MIM) framework for self-supervised learning, which simultaneously learns a masking function and an image encoder using an adversarial objective. The image encoder is trained to minimise the distance between representations of the original and that of a masked image. The masking function, conversely, aims at maximising this distance. ADIOS consistently improves on state-of-the-art self-supervised learning (SSL) methods on a variety of tasks and datasets\u2014including classification on ImageNet100 and STL10, transfer learning on CIFAR10/100, Flowers102 and iNaturalist, as well as robustness evaluated on the backgrounds challenge (Xiao et al., 2021)\u2014while generating semantically meaningful masks. Unlike modern MIM models such as MAE, BEiT and iBOT, ADIOS does not rely on the image-patch tokenisation construction of Vision Transformers, and can be implemented with convolutional backbones. 
We further demonstrate that the masks learned by ADIOS are more effective in improving representation learning of SSL methods than masking schemes used in popular MIM models.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22d/shi22d.pdf", "supp": "", "pdf_size": 2706988, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3881185449721325576&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Oxford; The University of Edinburgh & The Alan Turing Institute; University of Oxford; DeepMind", "aff_domain": "robots.ox.ac.uk; ; ;deepmind.com", "email": "robots.ox.ac.uk; ; ;deepmind.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/shi22d.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "University of Oxford;University of Edinburgh;DeepMind", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://www.ed.ac.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;Edinburgh;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Adversarial Robustness against Multiple and Single $l_p$-Threat Models via Quick Fine-Tuning of Robust Classifiers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16203", "id": "16203", "proceeding": "https://proceedings.mlr.press/v162/croce22b.html", "poster": "/media/PosterPDFs/ICML%202022/14eac0d254a6ccaf9b67584c7830a5c0.png?t=1657664632.0960655", "slides": "", "author_site": "Francesco Croce, Matthias Hein", "author": "Francesco Croce; Matthias Hein", "abstract": "A major drawback of adversarially robust models, in particular for large scale datasets like ImageNet, is the extremely long training time compared to standard models. Moreover, models should be robust not only to one $l_p$-threat model but ideally to all of them. In this paper we propose Extreme norm Adversarial Training (E-AT) for multiple-norm robustness which is based on geometric properties of $l_p$-balls. E-AT costs up to three times less than other adversarial training methods for multiple-norm robustness. Using E-AT we show that for ImageNet a single epoch and for CIFAR-10 three epochs are sufficient to turn any $l_p$-robust model into a multiple-norm robust model. In this way we get the first multiple-norm robust model for ImageNet and boost the state-of-the-art for multiple-norm robustness to more than $51\%$ on CIFAR-10. Finally, we study the general transfer via fine-tuning of adversarial robustness between different individual $l_p$-threat models and improve the previous SOTA $l_1$-robustness on both CIFAR-10 and ImageNet. 
Extensive experiments show that our scheme works across datasets and architectures including vision transformers.", "bibtex": "@InProceedings{pmlr-v162-croce22b,\n title = \t {Adversarial Robustness against Multiple and Single $l_p$-Threat Models via Quick Fine-Tuning of Robust Classifiers},\n author = {Croce, Francesco and Hein, Matthias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4436--4454},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/croce22b/croce22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/croce22b.html},\n abstract = \t {A major drawback of adversarially robust models, in particular for large scale datasets like ImageNet, is the extremely long training time compared to standard models. Moreover, models should be robust not only to one $l_p$-threat model but ideally to all of them. In this paper we propose Extreme norm Adversarial Training (E-AT) for multiple-norm robustness which is based on geometric properties of $l_p$-balls. E-AT costs up to three times less than other adversarial training methods for multiple-norm robustness. Using E-AT we show that for ImageNet a single epoch and for CIFAR-10 three epochs are sufficient to turn any $l_p$-robust model into a multiple-norm robust model. In this way we get the first multiple-norm robust model for ImageNet and boost the state-of-the-art for multiple-norm robustness to more than $51\%$ on CIFAR-10. Finally, we study the general transfer via fine-tuning of adversarial robustness between different individual $l_p$-threat models and improve the previous SOTA $l_1$-robustness on both CIFAR-10 and ImageNet. 
Extensive experiments show that our scheme works across datasets and architectures including vision transformers.}\n}", "pdf": "https://proceedings.mlr.press/v162/croce22b/croce22b.pdf", "supp": "", "pdf_size": 564048, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14798100310510930510&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of T\u00fcbingen, Germany; University of T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de; ", "email": "uni-tuebingen.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/croce22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of T\u00fcbingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Adversarial Vulnerability of Randomized Ensembles", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16509", "id": "16509", "proceeding": "https://proceedings.mlr.press/v162/dbouk22a.html", "poster": "/media/PosterPDFs/ICML%202022/62db9e3397c76207a687c360e0243317.png?t=1656712175.9779105", "slides": "/media/icml-2022/Slides/16509.pdf", "author_site": "Hassan Dbouk, Naresh Shanbhag", "author": "Hassan Dbouk; Naresh Shanbhag", "abstract": "Despite the tremendous success of deep neural networks across various tasks, their vulnerability to imperceptible adversarial perturbations has hindered their deployment in the real world. Recently, works on randomized ensembles have empirically demonstrated significant improvements in adversarial robustness over standard adversarially trained (AT) models with minimal computational overhead, making them a promising solution for safety-critical resource-constrained applications. However, this impressive performance raises the question: Are these robustness gains provided by randomized ensembles real? In this work we address this question both theoretically and empirically. We first establish theoretically that commonly employed robustness evaluation methods such as adaptive PGD provide a false sense of security in this setting. Subsequently, we propose a theoretically-sound and efficient adversarial attack algorithm (ARC) capable of compromising random ensembles even in cases where adaptive PGD fails to do so. We conduct comprehensive experiments across a variety of network architectures, training schemes, datasets, and norms to support our claims, and empirically establish that randomized ensembles are in fact more vulnerable to $\\ell_p$-bounded adversarial perturbations than even standard AT models. 
Our code can be found at https://github.com/hsndbk4/ARC.", "bibtex": "@InProceedings{pmlr-v162-dbouk22a,\n title = \t {Adversarial Vulnerability of Randomized Ensembles},\n author = {Dbouk, Hassan and Shanbhag, Naresh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4890--4917},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dbouk22a/dbouk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dbouk22a.html},\n abstract = \t {Despite the tremendous success of deep neural networks across various tasks, their vulnerability to imperceptible adversarial perturbations has hindered their deployment in the real world. Recently, works on randomized ensembles have empirically demonstrated significant improvements in adversarial robustness over standard adversarially trained (AT) models with minimal computational overhead, making them a promising solution for safety-critical resource-constrained applications. However, this impressive performance raises the question: Are these robustness gains provided by randomized ensembles real? In this work we address this question both theoretically and empirically. We first establish theoretically that commonly employed robustness evaluation methods such as adaptive PGD provide a false sense of security in this setting. Subsequently, we propose a theoretically-sound and efficient adversarial attack algorithm (ARC) capable of compromising random ensembles even in cases where adaptive PGD fails to do so. We conduct comprehensive experiments across a variety of network architectures, training schemes, datasets, and norms to support our claims, and empirically establish that randomized ensembles are in fact more vulnerable to $\\ell_p$-bounded adversarial perturbations than even standard AT models. 
Our code can be found at https://github.com/hsndbk4/ARC.}\n}", "pdf": "https://proceedings.mlr.press/v162/dbouk22a/dbouk22a.pdf", "supp": "", "pdf_size": 2009381, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2408757977511355426&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering, University of Illinois at Urbana-Champaign, Urbana, USA; Department of Electrical and Computer Engineering, University of Illinois at Urbana-Champaign, Urbana, USA", "aff_domain": "illinois.edu; ", "email": "illinois.edu; ", "github": "https://github.com/hsndbk4/ARC", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/dbouk22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Adversarially Robust Models may not Transfer Better: Sufficient Conditions for Domain Transferability from the View of Regularization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18193", "id": "18193", "proceeding": "https://proceedings.mlr.press/v162/xu22n.html", "poster": "/media/PosterPDFs/ICML%202022/f89394c979b34a25cc4ff8e11234fbfb.png?t=1657733186.5734258", "slides": "", "author_site": "Xiaojun Xu, Yibo Zhang, Evelyn Ma, Hyun Ho Son, Sanmi Koyejo, Bo Li", "author": "Xiaojun Xu; Jacky Y Zhang; Evelyn Ma; Hyun Ho Son; Sanmi Koyejo; Bo Li", "abstract": "Machine learning (ML) robustness and domain generalization are fundamentally correlated: they essentially concern data distribution shifts under adversarial and natural settings, respectively. On one hand, recent studies show that more robust (adversarially trained) models are more generalizable. On the other hand, there is a lack of theoretical understanding of their fundamental connections. In this paper, we explore the relationship between regularization and domain transferability considering different factors such as norm regularization and data augmentations (DA). We propose a general theoretical framework proving that factors involving the model function class regularization are sufficient conditions for relative domain transferability. Our analysis implies that \u201crobustness\" is neither necessary nor sufficient for transferability; rather, regularization is a more fundamental perspective for understanding domain transferability. We then discuss popular DA protocols (including adversarial training) and show when they can be viewed as the function class regularization under certain conditions and therefore improve generalization. 
We conduct extensive experiments to verify our theoretical findings and show several counterexamples where robustness and generalization are negatively correlated on different datasets.", "bibtex": "@InProceedings{pmlr-v162-xu22n,\n title = \t {Adversarially Robust Models may not Transfer Better: Sufficient Conditions for Domain Transferability from the View of Regularization},\n author = {Xu, Xiaojun and Zhang, Jacky Y and Ma, Evelyn and Son, Hyun Ho and Koyejo, Sanmi and Li, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24770--24802},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22n/xu22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22n.html},\n abstract = \t {Machine learning (ML) robustness and domain generalization are fundamentally correlated: they essentially concern data distribution shifts under adversarial and natural settings, respectively. On one hand, recent studies show that more robust (adversarially trained) models are more generalizable. On the other hand, there is a lack of theoretical understanding of their fundamental connections. In this paper, we explore the relationship between regularization and domain transferability considering different factors such as norm regularization and data augmentations (DA). We propose a general theoretical framework proving that factors involving the model function class regularization are sufficient conditions for relative domain transferability. Our analysis implies that \u201crobustness\" is neither necessary nor sufficient for transferability; rather, regularization is a more fundamental perspective for understanding domain transferability. We then discuss popular DA protocols (including adversarial training) and show when they can be viewed as the function class regularization under certain conditions and therefore improve generalization. 
We conduct extensive experiments to verify our theoretical findings and show several counterexamples where robustness and generalization are negatively correlated on different datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22n/xu22n.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/xu22n-supp.zip", "pdf_size": 1588032, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15666112236501144638&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu; ; ;illinois.edu;illinois.edu", "email": "illinois.edu;illinois.edu; ; ;illinois.edu;illinois.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/xu22n.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarially Trained Actor Critic for Offline Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17173", "id": "17173", "proceeding": "https://proceedings.mlr.press/v162/cheng22b.html", "poster": "/media/PosterPDFs/ICML%202022/69dd2eff9b6a421d5ce262b093bdab23.png?t=1658202101.8416028", "slides": "", "author_site": "Ching-An Cheng, Tengyang Xie, Nan Jiang, Alekh Agarwal", "author": "Ching-An Cheng; Tengyang Xie; Nan Jiang; Alekh Agarwal", "abstract": "We propose Adversarially Trained Actor Critic (ATAC), a new model-free algorithm for offline reinforcement learning (RL) under insufficient data coverage, based on the concept of relative pessimism. ATAC is designed as a two-player Stackelberg game framing of offline RL: A policy actor competes against an adversarially trained value critic, who finds data-consistent scenarios where the actor is inferior to the data-collection behavior policy. We prove that, when the actor attains no regret in the two-player game, running ATAC produces a policy that provably 1) outperforms the behavior policy over a wide range of hyperparameters that control the degree of pessimism, and 2) competes with the best policy covered by data with appropriately chosen hyperparameters. Compared with existing works, notably our framework offers both theoretical guarantees for general function approximation and a deep RL implementation scalable to complex environments and large datasets. 
In the D4RL benchmark, ATAC consistently outperforms state-of-the-art offline RL algorithms on a range of continuous control tasks.", "bibtex": "@InProceedings{pmlr-v162-cheng22b,\n title = \t {Adversarially Trained Actor Critic for Offline Reinforcement Learning},\n author = {Cheng, Ching-An and Xie, Tengyang and Jiang, Nan and Agarwal, Alekh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3852--3878},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cheng22b/cheng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/cheng22b.html},\n abstract = \t {We propose Adversarially Trained Actor Critic (ATAC), a new model-free algorithm for offline reinforcement learning (RL) under insufficient data coverage, based on the concept of relative pessimism. ATAC is designed as a two-player Stackelberg game framing of offline RL: A policy actor competes against an adversarially trained value critic, who finds data-consistent scenarios where the actor is inferior to the data-collection behavior policy. We prove that, when the actor attains no regret in the two-player game, running ATAC produces a policy that provably 1) outperforms the behavior policy over a wide range of hyperparameters that control the degree of pessimism, and 2) competes with the best policy covered by data with appropriately chosen hyperparameters. Compared with existing works, notably our framework offers both theoretical guarantees for general function approximation and a deep RL implementation scalable to complex environments and large datasets. 
In the D4RL benchmark, ATAC consistently outperforms state-of-the-art offline RL algorithms on a range of continuous control tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/cheng22b/cheng22b.pdf", "supp": "", "pdf_size": 4779678, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8385322441763797566&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; Google Research", "aff_domain": "microsoft.com; ; ; ", "email": "microsoft.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/cheng22b.html", "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Microsoft;University of Illinois Urbana-Champaign;Google", "aff_unique_dep": "Microsoft Research;;Google Research", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://illinois.edu;https://research.google", "aff_unique_abbr": "MSR;UIUC;Google Research", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Urbana-Champaign;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarially trained neural representations are already as robust as biological neural representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17123", "id": "17123", "proceeding": "https://proceedings.mlr.press/v162/guo22d.html", "poster": "", "slides": "", "author_site": "Chong Guo, Michael Lee, Guillaume Leclerc, Joel Dapello, Yug Rao, Aleksander Madry, James DiCarlo", "author": "Chong Guo; Michael Lee; Guillaume Leclerc; Joel Dapello; Yug Rao; Aleksander Madry; James Dicarlo", "abstract": "Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well-founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks.", "bibtex": "@InProceedings{pmlr-v162-guo22d,\n title = \t {Adversarially trained neural representations are already as robust as biological neural representations},\n author = {Guo, Chong and Lee, Michael and Leclerc, Guillaume and Dapello, Joel and Rao, Yug and Madry, Aleksander and Dicarlo, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8072--8081},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22d/guo22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22d.html},\n abstract = \t {Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. 
In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well-founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22d/guo22d.pdf", "supp": "", "pdf_size": 2056410, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10514557472913967382&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff": "McGovern Institute for Brain Research, MIT+Department of Brain and Cognitive Sciences, MIT+Center for Brains, Minds and Machines, MIT; McGovern Institute for Brain Research, MIT+Department of Brain and Cognitive Sciences, MIT+Center for Brains, Minds and Machines, MIT+Computer Science and Artificial Intelligence Laboratory, MIT+School of Engineering and Applied Sciences, Harvard University; Computer Science and Artificial Intelligence Laboratory, MIT; McGovern Institute for Brain Research, MIT+Department of Brain and Cognitive Sciences, MIT+Center for Brains, Minds and Machines, MIT+School of Engineering and Applied Sciences, Harvard University; Purdue University; Computer Science and Artificial Intelligence Laboratory, MIT+Department of Electrical Engineering and Computer Science, MIT; McGovern Institute for Brain Research, MIT+Department of Brain and Cognitive Sciences, MIT+Center for Brains, Minds and Machines, MIT", "aff_domain": "mit.edu; ; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/guo22d.html", "aff_unique_index": "0+0+0;0+0+0+0+1;0;0+0+0+1;2;0+0;0+0+0", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University;Purdue University", "aff_unique_dep": "McGovern Institute for Brain Research;School of Engineering and Applied Sciences;", "aff_unique_url": "https://www.mit.edu;https://www.harvard.edu;https://www.purdue.edu", "aff_unique_abbr": "MIT;Harvard;Purdue", "aff_campus_unique_index": "0+0+0;0+0+0+0+0;0;0+0+0+0;0+0;0+0+0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0+0;0+0+0+0+0;0;0+0+0+0;0;0+0;0+0+0", "aff_country_unique": "United States" }, { "title": "Agnostic Learnability of Halfspaces via Logistic Loss", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16453", "id": "16453", "proceeding": "https://proceedings.mlr.press/v162/ji22a.html", "poster": "/media/PosterPDFs/ICML%202022/da87cbc1b5b8501acf3b49eec1cc52c3.png?t=1657551840.7281039", "slides": "", "author_site": "Ziwei Ji, Kwangjun Ahn, Pranjal Awasthi, Satyen Kale, Stefani Karp", "author": "Ziwei Ji; Kwangjun Ahn; Pranjal Awasthi; Satyen Kale; Stefani Karp", "abstract": "We investigate approximation guarantees provided by logistic regression for the fundamental problem of agnostic learning of homogeneous halfspaces. Previously, for a certain broad class of \u201cwell-behaved\u201d distributions on the examples, Diakonikolas et al. (2020) proved an $\tilde{\Omega}(\mathrm{OPT})$ lower bound, while Frei et al. (2021) proved an $\tilde{O}(\sqrt{\mathrm{OPT}})$ upper bound, where OPT denotes the best zero-one/misclassification risk of a homogeneous halfspace. 
In this paper, we close this gap by constructing a well-behaved distribution such that the global minimizer of the logistic risk over this distribution only achieves $\Omega(\sqrt{\mathrm{OPT}})$ misclassification risk, matching the upper bound in (Frei et al., 2021). On the other hand, we also show that if we impose a radial-Lipschitzness condition in addition to well-behaved-ness on the distribution, logistic regression on a ball of bounded radius reaches $\tilde{O}(\mathrm{OPT})$ misclassification risk. Our techniques also show for any well-behaved distribution, regardless of radial Lipschitzness, we can overcome the $\Omega(\sqrt{\mathrm{OPT}})$ lower bound for logistic loss simply at the cost of one additional convex optimization step involving the hinge loss and attain $\tilde{O}(\mathrm{OPT})$ misclassification risk. This two-step convex optimization algorithm is simpler than previous methods obtaining this guarantee, all of which require solving $O(\log(1/\mathrm{OPT}))$ minimization problems.", "bibtex": "@InProceedings{pmlr-v162-ji22a,\n title = \t {Agnostic Learnability of Halfspaces via Logistic Loss},\n author = {Ji, Ziwei and Ahn, Kwangjun and Awasthi, Pranjal and Kale, Satyen and Karp, Stefani},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10068--10103},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ji22a/ji22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ji22a.html},\n abstract = \t {We investigate approximation guarantees provided by logistic regression for the fundamental problem of agnostic learning of homogeneous halfspaces. Previously, for a certain broad class of \u201cwell-behaved\u201d distributions on the examples, Diakonikolas et al. (2020) proved an $\tilde{\Omega}(\mathrm{OPT})$ lower bound, while Frei et al. (2021) proved an $\tilde{O}(\sqrt{\mathrm{OPT}})$ upper bound, where OPT denotes the best zero-one/misclassification risk of a homogeneous halfspace. In this paper, we close this gap by constructing a well-behaved distribution such that the global minimizer of the logistic risk over this distribution only achieves $\Omega(\sqrt{\mathrm{OPT}})$ misclassification risk, matching the upper bound in (Frei et al., 2021). On the other hand, we also show that if we impose a radial-Lipschitzness condition in addition to well-behaved-ness on the distribution, logistic regression on a ball of bounded radius reaches $\tilde{O}(\mathrm{OPT})$ misclassification risk. Our techniques also show for any well-behaved distribution, regardless of radial Lipschitzness, we can overcome the $\Omega(\sqrt{\mathrm{OPT}})$ lower bound for logistic loss simply at the cost of one additional convex optimization step involving the hinge loss and attain $\tilde{O}(\mathrm{OPT})$ misclassification risk. 
This two-step convex optimization algorithm is simpler than previous methods obtaining this guarantee, all of which require solving $O(\log(1/\mathrm{OPT}))$ minimization problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/ji22a/ji22a.pdf", "supp": "", "pdf_size": 446108, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15437642273872154502&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ji22a.html" }, { "title": "Algorithms for the Communication of Samples", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16207", "id": "16207", "proceeding": "https://proceedings.mlr.press/v162/theis22a.html", "poster": "/media/PosterPDFs/ICML%202022/e4dd5528f7596dcdf871aa55cfccc53c.png?t=1657276110.808915", "slides": "", "author_site": "Lucas Theis, Nour Ahmed", "author": "Lucas Theis; Noureldin Y Ahmed", "abstract": "The efficient communication of noisy data has applications in several areas of machine learning, such as neural compression or differential privacy, and is also known as reverse channel coding or the channel simulation problem. Here we propose two new coding schemes with practical advantages over existing approaches. First, we introduce ordered random coding (ORC) which uses a simple trick to reduce the coding cost of previous approaches. This scheme further illuminates a connection between schemes based on importance sampling and the so-called Poisson functional representation. Second, we describe a hybrid coding scheme which uses dithered quantization to more efficiently communicate samples from distributions with bounded support.", "bibtex": "@InProceedings{pmlr-v162-theis22a,\n title = \t {Algorithms for the Communication of Samples},\n author = {Theis, Lucas and Ahmed, Noureldin Y},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21308--21328},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/theis22a/theis22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/theis22a.html},\n abstract = \t {The efficient communication of noisy data has applications in several areas of machine learning, such as neural compression or differential privacy, and is also known as reverse channel coding or the channel simulation problem. Here we propose two new coding schemes with practical advantages over existing approaches. First, we introduce ordered random coding (ORC) which uses a simple trick to reduce the coding cost of previous approaches. This scheme further illuminates a connection between schemes based on importance sampling and the so-called Poisson functional representation. 
Second, we describe a hybrid coding scheme which uses dithered quantization to more efficiently communicate samples from distributions with bounded support.}\n}", "pdf": "https://proceedings.mlr.press/v162/theis22a/theis22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/theis22a-supp.zip", "pdf_size": 493878, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12531743848939504299&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Google, London, UK; Google, Dublin, Ireland", "aff_domain": "google.com; ", "email": "google.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/theis22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;1", "aff_campus_unique": "London;Dublin", "aff_country_unique_index": "0;1", "aff_country_unique": "United Kingdom;Ireland" }, { "title": "Align-RUDDER: Learning From Few Demonstrations by Reward Redistribution", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17807", "id": "17807", "proceeding": "https://proceedings.mlr.press/v162/patil22a.html", "poster": "/media/PosterPDFs/ICML%202022/01daa090f0d5693d97c90755a54fa204.png?t=1657702913.3637087", "slides": "", "author_site": "Vihang Patil, Markus Hofmarcher, Marius-Constantin Dinu, Matthias Dorfer, Patrick Blies, Johannes Brandstetter, Jose A. Arjona-Medina, Sepp Hochreiter", "author": "Vihang Patil; Markus Hofmarcher; Marius-Constantin Dinu; Matthias Dorfer; Patrick M Blies; Johannes Brandstetter; Jos\u00e9 Arjona-Medina; Sepp Hochreiter", "abstract": "Reinforcement learning algorithms require many samples when solving complex hierarchical tasks with sparse and delayed rewards. For such complex tasks, the recently proposed RUDDER uses reward redistribution to leverage steps in the Q-function that are associated with accomplishing sub-tasks. However, often only few episodes with high rewards are available as demonstrations since current exploration strategies cannot discover them in reasonable time. In this work, we introduce Align-RUDDER, which utilizes a profile model for reward redistribution that is obtained from multiple sequence alignment of demonstrations. Consequently, Align-RUDDER employs reward redistribution effectively and, thereby, drastically improves learning on few demonstrations. Align-RUDDER outperforms competitors on complex artificial tasks with delayed rewards and few demonstrations. On the Minecraft ObtainDiamond task, Align-RUDDER is able to mine a diamond, though not frequently. 
Code is available at github.com/ml-jku/align-rudder.", "bibtex": "@InProceedings{pmlr-v162-patil22a,\n title = \t {Align-{RUDDER}: Learning From Few Demonstrations by Reward Redistribution},\n author = {Patil, Vihang and Hofmarcher, Markus and Dinu, Marius-Constantin and Dorfer, Matthias and Blies, Patrick M and Brandstetter, Johannes and Arjona-Medina, Jos{\\'e} and Hochreiter, Sepp},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17531--17572},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/patil22a/patil22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/patil22a.html},\n abstract = \t {Reinforcement learning algorithms require many samples when solving complex hierarchical tasks with sparse and delayed rewards. For such complex tasks, the recently proposed RUDDER uses reward redistribution to leverage steps in the Q-function that are associated with accomplishing sub-tasks. However, often only few episodes with high rewards are available as demonstrations since current exploration strategies cannot discover them in reasonable time. In this work, we introduce Align-RUDDER, which utilizes a profile model for reward redistribution that is obtained from multiple sequence alignment of demonstrations. Consequently, Align-RUDDER employs reward redistribution effectively and, thereby, drastically improves learning on few demonstrations. Align-RUDDER outperforms competitors on complex artificial tasks with delayed rewards and few demonstrations. On the Minecraft ObtainDiamond task, Align-RUDDER is able to mine a diamond, though not frequently. 
Code is available at github.com/ml-jku/align-rudder.}\n}", "pdf": "https://proceedings.mlr.press/v162/patil22a/patil22a.pdf", "supp": "", "pdf_size": 9121539, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17099796649634976721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz+Dynatrace Research; ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz+enliteAI; ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz; enliteAI; enliteAI; ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz+Now at Microsoft Research; ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz+Dynatrace Research; ELLIS Unit Linz and LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz+Institute of Advanced Research in Artificial Intelligence", "aff_domain": "ml.jku.at; ; ; ; ; ; ; ", "email": "ml.jku.at; ; ; ; ; ; ; ", "github": "github.com/ml-jku/align-rudder", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/patil22a.html", "aff_unique_index": "0+1;0+2;0;2;2;0+3;0+1;0+4", "aff_unique_norm": "Johannes Kepler University Linz;Dynatrace;enliteAI;Microsoft;Institute of Advanced Research in Artificial Intelligence", "aff_unique_dep": "Institute for Machine Learning;Research;;Microsoft Research;", "aff_unique_url": "https://www.jku.at;https://www.dynatrace.com;;https://www.microsoft.com/en-us/research;", "aff_unique_abbr": "JKU;Dynatrace;;MSR;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Linz;", "aff_country_unique_index": "0+1;0;0;0+1;0+1;0", "aff_country_unique": "Austria;United States;" }, { "title": "An Analytical Update Rule for General Policy Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16719", "id": "16719", "proceeding": "https://proceedings.mlr.press/v162/li22d.html", "poster": "/media/PosterPDFs/ICML%202022/e1ab840a08f6e72d3baf13622bef60ad.png?t=1657643978.681007", "slides": "", "author_site": "Hepeng Li, Nicholas Clavette, Haibo He", "author": "Hepeng Li; Nicholas Clavette; Haibo He", "abstract": "We present an analytical policy update rule that is independent of parametric function approximators. The policy update rule is suitable for optimizing general stochastic policies and has a monotonic improvement guarantee. It is derived from a closed-form solution to trust-region optimization using calculus of variation, following a new theoretical result that tightens existing bounds for policy improvement using trust-region methods. The update rule builds a connection between policy search methods and value function methods. Moreover, off-policy reinforcement learning algorithms can be derived from the update rule since it does not need to compute integration over on-policy states. 
In addition, the update rule extends immediately to cooperative multi-agent systems when policy updates are performed by one agent at a time.", "bibtex": "@InProceedings{pmlr-v162-li22d,\n title = \t {An Analytical Update Rule for General Policy Optimization},\n author = {Li, Hepeng and Clavette, Nicholas and He, Haibo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12696--12716},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22d/li22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22d.html},\n abstract = \t {We present an analytical policy update rule that is independent of parametric function approximators. The policy update rule is suitable for optimizing general stochastic policies and has a monotonic improvement guarantee. It is derived from a closed-form solution to trust-region optimization using calculus of variation, following a new theoretical result that tightens existing bounds for policy improvement using trust-region methods. The update rule builds a connection between policy search methods and value function methods. Moreover, off-policy reinforcement learning algorithms can be derived from the update rule since it does not need to compute integration over on-policy states. In addition, the update rule extends immediately to cooperative multi-agent systems when policy updates are performed by one agent at a time.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22d/li22d.pdf", "supp": "", "pdf_size": 401421, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13533887933940269980&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical, Computer and Biomedical Engineering, University of Rhode Island, South Kingstown, RI, USA; Department of Electrical, Computer and Biomedical Engineering, University of Rhode Island, South Kingstown, RI, USA; Department of Electrical, Computer and Biomedical Engineering, University of Rhode Island, South Kingstown, RI, USA", "aff_domain": "uri.edu; ;uri.edu", "email": "uri.edu; ;uri.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Rhode Island", "aff_unique_dep": "Department of Electrical, Computer and Biomedical Engineering", "aff_unique_url": "https://www.uri.edu", "aff_unique_abbr": "URI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "South Kingstown", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "An Asymptotic Test for Conditional Independence using Analytic Kernel Embeddings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17723", "id": "17723", "proceeding": "https://proceedings.mlr.press/v162/scetbon22a.html", "poster": "/media/PosterPDFs/ICML%202022/6775a0635c302542da2c32aa19d86be0.png?t=1657716734.9260912", "slides": "", "author_site": "Meyer Scetbon, Laurent Meunier, Yaniv Romano", "author": "Meyer Scetbon; Laurent Meunier; Yaniv Romano", "abstract": "We propose a new conditional dependence measure and a statistical test for conditional independence. 
The measure is based on the difference between analytic kernel embeddings of two well-suited distributions evaluated at a finite set of locations. We obtain its asymptotic distribution under the null hypothesis of conditional independence and design a consistent statistical test from it. We conduct a series of experiments showing that our new test outperforms state-of-the-art methods both in terms of type-I and type-II errors even in the high dimensional setting.", "bibtex": "@InProceedings{pmlr-v162-scetbon22a,\n title = \t {An Asymptotic Test for Conditional Independence using Analytic Kernel Embeddings},\n author = {Scetbon, Meyer and Meunier, Laurent and Romano, Yaniv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19328--19346},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/scetbon22a/scetbon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/scetbon22a.html},\n abstract = \t {We propose a new conditional dependence measure and a statistical test for conditional independence. The measure is based on the difference between analytic kernel embeddings of two well-suited distributions evaluated at a finite set of locations. We obtain its asymptotic distribution under the null hypothesis of conditional independence and design a consistent statistical test from it. We conduct a series of experiments showing that our new test outperforms state-of-the-art methods both in terms of type-I and type-II errors even in the high dimensional setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/scetbon22a/scetbon22a.pdf", "supp": "", "pdf_size": 1103983, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14026015450757796884&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "CREST, ENSAE, France+Facebook AI Research, Paris, France+Universit\u00e9 Paris-Dauphine, France; CREST, ENSAE, France+Facebook AI Research, Paris, France+Universit\u00e9 Paris-Dauphine, France; Departments of Electrical and Computer Engineering and of Computer Science, Technion, Israel", "aff_domain": "ensae.fr; ; ", "email": "ensae.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/scetbon22a.html", "aff_unique_index": "0+1+2;0+1+2;3", "aff_unique_norm": "CREST;Meta;Universit\u00e9 Paris-Dauphine;Technion", "aff_unique_dep": ";Facebook AI Research;;Departments of Electrical and Computer Engineering and of Computer Science", "aff_unique_url": ";https://research.facebook.com;https://www.univ-paris-dauphine.fr;https://www.technion.ac.il", "aff_unique_abbr": ";FAIR;UPD;Technion", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0+0+0;0+0+0;1", "aff_country_unique": "France;Israel" }, { "title": "An Equivalence Between Data Poisoning and Byzantine Gradient Attacks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17453", "id": "17453", "proceeding": "https://proceedings.mlr.press/v162/farhadkhani22b.html", "poster": "/media/PosterPDFs/ICML%202022/f076073b2082f8741a9cd07b789c77a0.png?t=1657701213.8000708", "slides": "", "author_site": "Sadegh Farhadkhani, Rachid Guerraoui, L\u00ea-Nguy\u00ean Hoang, Oscar Villemaud", "author": "Sadegh 
Farhadkhani; Rachid Guerraoui; L\u00ea Nguy\u00ean Hoang; Oscar Villemaud", "abstract": "To study the resilience of distributed learning, the \u201cByzantine\" literature considers a strong threat model where workers can report arbitrary gradients to the parameter server. Whereas this model helped obtain several fundamental results, it has sometimes been considered unrealistic, when the workers are mostly trustworthy machines. In this paper, we show a surprising equivalence between this model and data poisoning, a threat considered much more realistic. More specifically, we prove that every gradient attack can be reduced to data poisoning, in any personalized federated learning system with PAC guarantees (which we show are both desirable and realistic). This equivalence makes it possible to obtain new impossibility results on the resilience of", "bibtex": "@InProceedings{pmlr-v162-farhadkhani22b,\n title = \t {An Equivalence Between Data Poisoning and {B}yzantine Gradient Attacks},\n author = {Farhadkhani, Sadegh and Guerraoui, Rachid and Hoang, L{\\^e} Nguy{\\^e}n and Villemaud, Oscar},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6284--6323},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/farhadkhani22b/farhadkhani22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/farhadkhani22b.html},\n abstract = \t {To study the resilience of distributed learning, the \u201cByzantine\" literature considers a strong threat model where workers can report arbitrary gradients to the parameter server. Whereas this model helped obtain several fundamental results, it has sometimes been considered unrealistic, when the workers are mostly trustworthy machines. In this paper, we show a surprising equivalence between this model and data poisoning, a threat considered much more realistic. More specifically, we prove that every gradient attack can be reduced to data poisoning, in any personalized federated learning system with PAC guarantees (which we show are both desirable and realistic). 
This equivalence makes it possible to obtain new impossibility results on the resilience of", "pdf": "https://proceedings.mlr.press/v162/farhadkhani22b/farhadkhani22b.pdf", "supp": "", "pdf_size": 2099365, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15814948581438408162&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IC School, EPFL, Lausanne, Switzerland; IC School, EPFL, Lausanne, Switzerland; IC School, EPFL, Lausanne, Switzerland; IC School, EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ;epfl.ch; ", "email": "epfl.ch; ;epfl.ch; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/farhadkhani22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "IC School", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "An Exact Symbolic Reduction of Linear Smart Predict+Optimize to Mixed Integer Linear Programming", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18287", "id": "18287", "proceeding": "https://proceedings.mlr.press/v162/jeong22a.html", "poster": "/media/PosterPDFs/ICML%202022/5637f327937ff2beb7d0a499a0b99d3c.png?t=1657772297.708138", "slides": "", "author_site": "Jihwan Jeong, Parth Jaggi, Andrew Butler, Scott Sanner", "author": "Jihwan Jeong; Parth Jaggi; Andrew Butler; Scott Sanner", "abstract": "Predictive models are traditionally optimized independently of their use in downstream decision-based optimization. The \u2018smart, predict then optimize\u2019 (SPO) framework addresses this shortcoming by optimizing predictive models in order to", "bibtex": "@InProceedings{pmlr-v162-jeong22a,\n title = \t {An Exact Symbolic Reduction of Linear Smart {P}redict+{O}ptimize to Mixed Integer Linear Programming},\n author = {Jeong, Jihwan and Jaggi, Parth and Butler, Andrew and Sanner, Scott},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10053--10067},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jeong22a/jeong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jeong22a.html},\n abstract = \t {Predictive models are traditionally optimized independently of their use in downstream decision-based optimization. 
The \u2018smart, predict then optimize\u2019 (SPO) framework addresses this shortcoming by optimizing predictive models in order to", "pdf": "https://proceedings.mlr.press/v162/jeong22a/jeong22a.pdf", "supp": "", "pdf_size": 780217, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8103894190667397922&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada; Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada; Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada", "aff_domain": "mie.utoronto.ca; ; ; ", "email": "mie.utoronto.ca; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jeong22a.html", "aff_unique_index": "0+1;0;0;0+1", "aff_unique_norm": "University of Toronto;Vector Institute", "aff_unique_dep": "Department of Mechanical and Industrial Engineering;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai", "aff_unique_abbr": "U of T;Vector Institute", "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "Canada" }, { "title": "An Initial Alignment between Neural Network and Target is Needed for Gradient Descent to Learn", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18035", "id": "18035", "proceeding": "https://proceedings.mlr.press/v162/abbe22a.html", "poster": "/media/PosterPDFs/ICML%202022/2e92962c0b6996add9517e4242ea9bdc.png?t=1657464658.3703907", "slides": "", "author_site": "Emmanuel Abbe, Elisabetta Cornacchia, Jan Hazla, Christopher Marquis", "author": "Emmanuel Abbe; Elisabetta Cornacchia; Jan Hazla; Christopher Marquis", "abstract": "This paper introduces the notion of \u201cInitial Alignment\u201d (INAL) between a neural network at initialization and a target function. It is proved that if a network and a Boolean target function do not have a noticeable INAL, then noisy gradient descent with normalized i.i.d. initialization will not learn in polynomial time. Thus a certain amount of knowledge about the target (measured by the INAL) is needed in the architecture design. This also provides an answer to an open problem posed in (AS-NeurIPS\u201920). 
The results are based on deriving lower-bounds for descent algorithms on symmetric neural networks without explicit knowledge of the target function beyond its INAL.", "bibtex": "@InProceedings{pmlr-v162-abbe22a,\n title = \t {An Initial Alignment between Neural Network and Target is Needed for Gradient Descent to Learn},\n author = {Abbe, Emmanuel and Cornacchia, Elisabetta and Hazla, Jan and Marquis, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {33--52},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/abbe22a/abbe22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/abbe22a.html},\n abstract = \t {This paper introduces the notion of \u201cInitial Alignment\u201d (INAL) between a neural network at initialization and a target function. It is proved that if a network and a Boolean target function do not have a noticeable INAL, then noisy gradient descent with normalized i.i.d. initialization will not learn in polynomial time. Thus a certain amount of knowledge about the target (measured by the INAL) is needed in the architecture design. This also provides an answer to an open problem posed in (AS-NeurIPS\u201920). The results are based on deriving lower-bounds for descent algorithms on symmetric neural networks without explicit knowledge of the target function beyond its INAL.}\n}", "pdf": "https://proceedings.mlr.press/v162/abbe22a/abbe22a.pdf", "supp": "", "pdf_size": 422195, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4878289490978629986&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Institute of Mathematics, EPFL, Lausanne, Switzerland; Institute of Mathematics, EPFL, Lausanne, Switzerland; Institute of Mathematics, EPFL, Lausanne, Switzerland + African Institute for Mathematical Sciences (AIMS), Kigali, Rwanda; Institute of Mathematics, EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch;epfl.ch;epfl.ch;epfl.ch", "email": "epfl.ch;epfl.ch;epfl.ch;epfl.ch", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/abbe22a.html", "aff_unique_index": "0;0;0+1;0", "aff_unique_norm": "EPFL;African Institute for Mathematical Sciences", "aff_unique_dep": "Institute of Mathematics;", "aff_unique_url": "https://www.epfl.ch;https://www.aims.ac.rw", "aff_unique_abbr": "EPFL;AIMS", "aff_campus_unique_index": "0;0;0+1;0", "aff_campus_unique": "Lausanne;Kigali", "aff_country_unique_index": "0;0;0+1;0", "aff_country_unique": "Switzerland;Rwanda" }, { "title": "An Intriguing Property of Geophysics Inversion", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17775", "id": "17775", "proceeding": "https://proceedings.mlr.press/v162/feng22a.html", "poster": "/media/PosterPDFs/ICML%202022/86b48b560a92d18429bcfca2c70ee733.png?t=1657683145.8565774", "slides": "", "author_site": "Yinan Feng, Yinpeng Chen, Shihang Feng, Peng Jin, Zicheng Liu, Youzuo Lin", "author": "Yinan Feng; Yinpeng Chen; Shihang Feng; Peng Jin; Zicheng Liu; Youzuo Lin", "abstract": "Inversion techniques are widely used to reconstruct subsurface physical properties (e.g., velocity, conductivity) from surface-based geophysical measurements (e.g., seismic, electric/magnetic (EM) data). 
The problems are governed by partial differential equations (PDEs) like the wave or Maxwell\u2019s equations. Solving geophysical inversion problems is challenging due to the ill-posedness and high computational cost. To alleviate those issues, recent studies leverage deep neural networks to learn the inversion mappings from measurements to the property directly. In this paper, we show that such a mapping can be well modeled by a very shallow (but not wide) network with only five layers. This is achieved based on our new finding of an intriguing property: a near-linear relationship between the input and output, after applying integral transform in high dimensional space. In particular, when dealing with the inversion from seismic data to subsurface velocity governed by a wave equation, the integral results of velocity with Gaussian kernels are linearly correlated to the integral of seismic data with sine kernels. Furthermore, this property can be easily turned into a light-weight encoder-decoder network for inversion. The encoder contains the integration of seismic data and the linear transformation without need for fine-tuning. The decoder only consists of a single transformer block to reverse the integral of velocity. Experiments show that this interesting property holds for two geophysics inversion problems over four different datasets. Compared to much deeper InversionNet, our method achieves comparable accuracy, but consumes significantly fewer parameters", "bibtex": "@InProceedings{pmlr-v162-feng22a,\n title = \t {An Intriguing Property of Geophysics Inversion},\n author = {Feng, Yinan and Chen, Yinpeng and Feng, Shihang and Jin, Peng and Liu, Zicheng and Lin, Youzuo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6434--6446},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/feng22a/feng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/feng22a.html},\n abstract = \t {Inversion techniques are widely used to reconstruct subsurface physical properties (e.g., velocity, conductivity) from surface-based geophysical measurements (e.g., seismic, electric/magnetic (EM) data). The problems are governed by partial differential equations (PDEs) like the wave or Maxwell\u2019s equations. Solving geophysical inversion problems is challenging due to the ill-posedness and high computational cost. To alleviate those issues, recent studies leverage deep neural networks to learn the inversion mappings from measurements to the property directly. In this paper, we show that such a mapping can be well modeled by a very shallow (but not wide) network with only five layers. This is achieved based on our new finding of an intriguing property: a near-linear relationship between the input and output, after applying integral transform in high dimensional space. In particular, when dealing with the inversion from seismic data to subsurface velocity governed by a wave equation, the integral results of velocity with Gaussian kernels are linearly correlated to the integral of seismic data with sine kernels. Furthermore, this property can be easily turned into a light-weight encoder-decoder network for inversion. 
The encoder contains the integration of seismic data and the linear transformation without need for fine-tuning. The decoder only consists of a single transformer block to reverse the integral of velocity. Experiments show that this interesting property holds for two geophysics inversion problems over four different datasets. Compared to much deeper InversionNet, our method achieves comparable accuracy, but consumes significantly fewer parameters}\n}", "pdf": "https://proceedings.mlr.press/v162/feng22a/feng22a.pdf", "supp": "", "pdf_size": 3571070, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10511521280594780133&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "Earth and Environmental Sciences Division, Los Alamos National Laboratory, USA+1; Microsoft Research, USA+2; Earth and Environmental Sciences Division, Los Alamos National Laboratory, USA+1; College of Information Sciences and Technology, The Pennsylvania State University, USA+3; Microsoft Research, USA+2; Earth and Environmental Sciences Division, Los Alamos National Laboratory, USA+1", "aff_domain": "lanl.gov;microsoft.com;lanl.gov;psu.edu;microsoft.com;lanl.gov", "email": "lanl.gov;microsoft.com;lanl.gov;psu.edu;microsoft.com;lanl.gov", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/feng22a.html", "aff_unique_index": "0;2;0;3;2;0", "aff_unique_norm": "Los Alamos National Laboratory;;Microsoft;Pennsylvania State University", "aff_unique_dep": "Earth and Environmental Sciences Division;;Microsoft Research;College of Information Sciences and Technology", "aff_unique_url": "https://www.lanl.gov;;https://www.microsoft.com/en-us/research;https://www.psu.edu", "aff_unique_abbr": "LANL;;MSR;PSU", "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "An iterative clustering algorithm for the Contextual Stochastic Block Model with optimality guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16027", "id": "16027", "proceeding": "https://proceedings.mlr.press/v162/braun22a.html", "poster": "/media/PosterPDFs/ICML%202022/24f2f931f12a4d9149876a5bef93e96a.png?t=1656593586.8849807", "slides": "", "author_site": "Guillaume Braun, Hemant Tyagi, Christophe Biernacki", "author": "Guillaume Braun; Hemant Tyagi; Christophe Biernacki", "abstract": "Real-world networks often come with side information that can help to improve the performance of network analysis tasks such as clustering. Despite a large number of empirical and theoretical studies conducted on network clustering methods during the past decade, the added value of side information and the methods used to incorporate it optimally in clustering algorithms are relatively less understood. We propose a new iterative algorithm to cluster networks with side information for nodes (in the form of covariates) and show that our algorithm is optimal under the Contextual Symmetric Stochastic Block Model. Our algorithm can be applied to general Contextual Stochastic Block Models and avoids hyperparameter tuning in contrast to previously proposed methods. We confirm our theoretical results on synthetic data experiments where our algorithm significantly outperforms other methods, and show that it can also be applied to signed graphs. 
Finally we demonstrate the practical interest of our method on real data.", "bibtex": "@InProceedings{pmlr-v162-braun22a,\n title = \t {An iterative clustering algorithm for the Contextual Stochastic Block Model with optimality guarantees},\n author = {Braun, Guillaume and Tyagi, Hemant and Biernacki, Christophe},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2257--2291},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/braun22a/braun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/braun22a.html},\n abstract = \t {Real-world networks often come with side information that can help to improve the performance of network analysis tasks such as clustering. Despite a large number of empirical and theoretical studies conducted on network clustering methods during the past decade, the added value of side information and the methods used to incorporate it optimally in clustering algorithms are relatively less understood. We propose a new iterative algorithm to cluster networks with side information for nodes (in the form of covariates) and show that our algorithm is optimal under the Contextual Symmetric Stochastic Block Model. Our algorithm can be applied to general Contextual Stochastic Block Models and avoids hyperparameter tuning in contrast to previously proposed methods. We confirm our theoretical results on synthetic data experiments where our algorithm significantly outperforms other methods, and show that it can also be applied to signed graphs. 
Finally we demonstrate the practical interest of our method on real data.}\n}", "pdf": "https://proceedings.mlr.press/v162/braun22a/braun22a.pdf", "supp": "", "pdf_size": 1506040, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10661970010431064554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Inria, Universit \u00b4e de Lille, CNRS, Laboratoire de math \u00b4ematiques Painlev \u00b4e, 59650 Villeneuve d\u2019Ascq, France; Inria, Universit \u00b4e de Lille, CNRS, Laboratoire de math \u00b4ematiques Painlev \u00b4e, 59650 Villeneuve d\u2019Ascq, France; Inria, Universit \u00b4e de Lille, CNRS, Laboratoire de math \u00b4ematiques Painlev \u00b4e, 59650 Villeneuve d\u2019Ascq, France", "aff_domain": "inria.fr; ; ", "email": "inria.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/braun22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "Laboratoire de math\u00e9matiques Painlev\u00e9", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "Inria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Analysis of Stochastic Processes through Replay Buffers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16377", "id": "16377", "proceeding": "https://proceedings.mlr.press/v162/di-castro22a.html", "poster": "/media/PosterPDFs/ICML%202022/1b9e43c170cd3fc59624a18663b8d4d2.png?t=1657558805.6907482", "slides": "", "author_site": "Shirli Di-Castro Shashua, Shie Mannor, Dotan Di Castro", "author": "Shirli Di-Castro; Shie Mannor; Dotan Di Castro", "abstract": "Replay buffers are a key component in many reinforcement learning schemes. Yet, their theoretical properties are not fully understood. In this paper we analyze a system where a stochastic process X is pushed into a replay buffer and then randomly sampled to generate a stochastic process Y from the replay buffer. We provide an analysis of the properties of the sampled process such as stationarity, Markovity and autocorrelation in terms of the properties of the original process. Our theoretical analysis sheds light on why replay buffer may be a good de-correlator. Our analysis provides theoretical tools for proving the convergence of replay buffer based algorithms which are prevalent in reinforcement learning schemes.", "bibtex": "@InProceedings{pmlr-v162-di-castro22a,\n title = \t {Analysis of Stochastic Processes through Replay Buffers},\n author = {Di-Castro, Shirli and Mannor, Shie and Castro, Dotan Di},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5039--5060},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/di-castro22a/di-castro22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/di-castro22a.html},\n abstract = \t {Replay buffers are a key component in many reinforcement learning schemes. Yet, their theoretical properties are not fully understood. In this paper we analyze a system where a stochastic process X is pushed into a replay buffer and then randomly sampled to generate a stochastic process Y from the replay buffer. 
We provide an analysis of the properties of the sampled process such as stationarity, Markovity and autocorrelation in terms of the properties of the original process. Our theoretical analysis sheds light on why replay buffer may be a good de-correlator. Our analysis provides theoretical tools for proving the convergence of replay buffer based algorithms which are prevalent in reinforcement learning schemes.}\n}", "pdf": "https://proceedings.mlr.press/v162/di-castro22a/di-castro22a.pdf", "supp": "", "pdf_size": 606266, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11719523885818645660&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Technion Institute of Technology, Haifa, Israel; Technion Institute of Technology, Haifa, Israel + NVIDIA Research, Israel; Bosch Center of AI, Haifa, Israel", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/di-castro22a.html", "aff_unique_index": "0;0+1;2", "aff_unique_norm": "Technion Institute of Technology;NVIDIA;Bosch Center of AI", "aff_unique_dep": ";Research;AI", "aff_unique_url": "https://www.technion.ac.il;https://research.nvidia.com;https://www.bosch.com/research/locations/haifa/", "aff_unique_abbr": "Technion;NVIDIA;BCAI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Haifa;", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "Israel" }, { "title": "Analyzing and Mitigating Interference in Neural Architecture Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16137", "id": "16137", "proceeding": "https://proceedings.mlr.press/v162/xu22h.html", "poster": "/media/PosterPDFs/ICML%202022/978fce5bcc4eccc88ad48ce3914124a2.png?t=1656744011.828574", "slides": "", "author_site": "Jin Xu, Xu Tan, Kaitao Song, Renqian Luo, Yichong Leng, Tao Qin, Tie-Yan Liu, Jian Li", "author": "Jin Xu; Xu Tan; Kaitao Song; Renqian Luo; Yichong Leng; Tao Qin; Tie-Yan Liu; Jian Li", "abstract": "Weight sharing is a popular approach to reduce the training cost of neural architecture search (NAS) by reusing the weights of shared operators from previously trained child models. However, the rank correlation between the estimated accuracy and ground truth accuracy of those child models is low due to the interference among different child models caused by weight sharing. In this paper, we investigate the interference issue by sampling different child models and calculating the gradient similarity of shared operators, and observe that: 1) the interference on a shared operator between two child models is positively correlated with the number of different operators between them; 2) the interference is smaller when the inputs and outputs of the shared operator are more similar. Inspired by these two observations, we propose two approaches to mitigate the interference: 1) rather than randomly sampling child models for optimization, we propose a gradual modification scheme by modifying one operator between adjacent optimization steps to minimize the interference on the shared operators; 2) forcing the inputs and outputs of the operator across all child models to be similar to reduce the interference. Experiments on a BERT search space verify that mitigating interference via each of our proposed methods improves the rank correlation of super-net and combining both methods can achieve better results. 
Our discovered architecture outperforms RoBERTa$_{\\rm base}$ by 1.1 and 0.6 points and ELECTRA$_{\\rm base}$ by 1.6 and 1.1 points on the dev and test set of GLUE benchmark. Extensive results on the BERT compression, reading comprehension and large-scale image classification tasks also demonstrate the effectiveness and generality of our proposed methods.", "bibtex": "@InProceedings{pmlr-v162-xu22h,\n title = \t {Analyzing and Mitigating Interference in Neural Architecture Search},\n author = {Xu, Jin and Tan, Xu and Song, Kaitao and Luo, Renqian and Leng, Yichong and Qin, Tao and Liu, Tie-Yan and Li, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24646--24662},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22h/xu22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22h.html},\n abstract = \t {Weight sharing is a popular approach to reduce the training cost of neural architecture search (NAS) by reusing the weights of shared operators from previously trained child models. However, the rank correlation between the estimated accuracy and ground truth accuracy of those child models is low due to the interference among different child models caused by weight sharing. In this paper, we investigate the interference issue by sampling different child models and calculating the gradient similarity of shared operators, and observe that: 1) the interference on a shared operator between two child models is positively correlated with the number of different operators between them; 2) the interference is smaller when the inputs and outputs of the shared operator are more similar. Inspired by these two observations, we propose two approaches to mitigate the interference: 1) rather than randomly sampling child models for optimization, we propose a gradual modification scheme by modifying one operator between adjacent optimization steps to minimize the interference on the shared operators; 2) forcing the inputs and outputs of the operator across all child models to be similar to reduce the interference. Experiments on a BERT search space verify that mitigating interference via each of our proposed methods improves the rank correlation of super-net and combining both methods can achieve better results. Our discovered architecture outperforms RoBERTa$_{\\rm base}$ by 1.1 and 0.6 points and ELECTRA$_{\\rm base}$ by 1.6 and 1.1 points on the dev and test set of GLUE benchmark. 
Extensive results on the BERT compression, reading comprehension and large-scale image classification tasks also demonstrate the effectiveness and generality of our proposed methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22h/xu22h.pdf", "supp": "", "pdf_size": 4607040, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff": "Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University; Microsoft Research Asia; Microsoft Research Asia; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University; Microsoft Research Asia+University of Science and Technology of China; Microsoft Research Asia; Microsoft Research Asia; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University", "aff_domain": "microsoft.com;microsoft.com; ;mail.tsinghua.edu.cn; ; ; ;mail.tsinghua.edu.cn", "email": "microsoft.com;microsoft.com; ;mail.tsinghua.edu.cn; ; ; ;mail.tsinghua.edu.cn", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/xu22h.html", "aff_unique_index": "0;1;1;0;1+2;1;1;0", "aff_unique_norm": "Tsinghua University;Microsoft;University of Science and Technology of China", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences (IIIS);Research;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.ustc.edu.cn", "aff_unique_abbr": "Tsinghua;MSR Asia;USTC", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0+0;0;0;0", "aff_country_unique": "China" }, { "title": "Anarchic Federated Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17061", "id": "17061", "proceeding": "https://proceedings.mlr.press/v162/yang22r.html", "poster": "/media/PosterPDFs/ICML%202022/23ce1851341ec1fa9e0c259de10bf87c_L1fX4Ow.png?t=1658123016.435583", "slides": "/media/icml-2022/Slides/17061_kg1Vbwp.pdf", "author_site": "Haibo Yang, Xin Zhang, Prashant Khanduri, Jia Liu", "author": "Haibo Yang; Xin Zhang; Prashant Khanduri; Jia Liu", "abstract": "Present-day federated learning (FL) systems deployed over edge networks consists of a large number of workers with high degrees of heterogeneity in data and/or computing capabilities, which call for flexible worker participation in terms of timing, effort, data heterogeneity, etc. To satisfy the need for flexible worker participation, we consider a new FL paradigm called \u201cAnarchic Federated Learning\u201d (AFL) in this paper. In stark contrast to conventional FL models, each worker in AFL has the freedom to choose i) when to participate in FL, and ii) the number of local steps to perform in each round based on its current situation (e.g., battery level, communication channels, privacy concerns). However, such chaotic worker behaviors in AFL impose many new open questions in algorithm design. In particular, it remains unclear whether one could develop convergent AFL training algorithms, and if yes, under what conditions and how fast the achievable convergence speed is. Toward this end, we propose two Anarchic Federated Averaging (AFA) algorithms with two-sided learning rates for both cross-device and cross-silo settings, which are named AFA-CD and AFA-CS, respectively. Somewhat surprisingly, we show that, under mild anarchic assumptions, both AFL algorithms achieve the best known convergence rate as the state-of-the-art algorithms for conventional FL. 
Moreover, they retain the highly desirable", "bibtex": "@InProceedings{pmlr-v162-yang22r,\n title = \t {Anarchic Federated Learning},\n author = {Yang, Haibo and Zhang, Xin and Khanduri, Prashant and Liu, Jia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25331--25363},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22r/yang22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22r.html},\n abstract = \t {Present-day federated learning (FL) systems deployed over edge networks consists of a large number of workers with high degrees of heterogeneity in data and/or computing capabilities, which call for flexible worker participation in terms of timing, effort, data heterogeneity, etc. To satisfy the need for flexible worker participation, we consider a new FL paradigm called \u201cAnarchic Federated Learning\u201d (AFL) in this paper. In stark contrast to conventional FL models, each worker in AFL has the freedom to choose i) when to participate in FL, and ii) the number of local steps to perform in each round based on its current situation (e.g., battery level, communication channels, privacy concerns). However, such chaotic worker behaviors in AFL impose many new open questions in algorithm design. In particular, it remains unclear whether one could develop convergent AFL training algorithms, and if yes, under what conditions and how fast the achievable convergence speed is. Toward this end, we propose two Anarchic Federated Averaging (AFA) algorithms with two-sided learning rates for both cross-device and cross-silo settings, which are named AFA-CD and AFA-CS, respectively. Somewhat surprisingly, we show that, under mild anarchic assumptions, both AFL algorithms achieve the best known convergence rate as the state-of-the-art algorithms for conventional FL. Moreover, they retain the highly desirable", "pdf": "https://proceedings.mlr.press/v162/yang22r/yang22r.pdf", "supp": "", "pdf_size": 708303, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6853314649307559893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yang22r.html" }, { "title": "Antibody-Antigen Docking and Design via Hierarchical Structure Refinement", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16625", "id": "16625", "proceeding": "https://proceedings.mlr.press/v162/jin22a.html", "poster": "/media/PosterPDFs/ICML%202022/b7f520a55897b35e6eb462bbf80915c6.png?t=1658193433.296417", "slides": "", "author_site": "Wengong Jin, Regina Barzilay, Tommi Jaakkola", "author": "Wengong Jin; Dr.Regina Barzilay; Tommi Jaakkola", "abstract": "Computational antibody design seeks to automatically create an antibody that binds to an antigen. The binding affinity is governed by the 3D binding interface where antibody residues (paratope) closely interact with antigen residues (epitope). Thus, the key question of antibody design is how to predict the 3D paratope-epitope complex (i.e., docking) for paratope generation. 
In this paper, we propose a new model called Hierarchical Structure Refinement Network (HSRN) for paratope docking and design. During docking, HSRN employs a hierarchical message passing network to predict atomic forces and use them to refine a binding complex in an iterative, equivariant manner. During generation, its autoregressive decoder progressively docks generated paratopes and builds a geometric representation of the binding interface to guide the next residue choice. Our results show that HSRN significantly outperforms prior state-of-the-art on paratope docking and design benchmarks.", "bibtex": "@InProceedings{pmlr-v162-jin22a,\n title = \t {Antibody-Antigen Docking and Design via Hierarchical Structure Refinement},\n author = {Jin, Wengong and Barzilay, Dr.Regina and Jaakkola, Tommi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10217--10227},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22a/jin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22a.html},\n abstract = \t {Computational antibody design seeks to automatically create an antibody that binds to an antigen. The binding affinity is governed by the 3D binding interface where antibody residues (paratope) closely interact with antigen residues (epitope). Thus, the key question of antibody design is how to predict the 3D paratope-epitope complex (i.e., docking) for paratope generation. In this paper, we propose a new model called Hierarchical Structure Refinement Network (HSRN) for paratope docking and design. During docking, HSRN employs a hierarchical message passing network to predict atomic forces and use them to refine a binding complex in an iterative, equivariant manner. During generation, its autoregressive decoder progressively docks generated paratopes and builds a geometric representation of the binding interface to guide the next residue choice. 
Our results show that HSRN significantly outperforms prior state-of-the-art on paratope docking and design benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22a/jin22a.pdf", "supp": "", "pdf_size": 5967403, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9061013714538857266&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Eric and Wendy Schmidt Center, Broad Institute of MIT and Harvard; CSAIL, Massachusetts Institute of Technology; CSAIL, Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu; ; ", "email": "csail.mit.edu; ; ", "github": "github.com/wengong-jin/abdockgen", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jin22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Broad Institute of MIT and Harvard;Massachusetts Institute of Technology", "aff_unique_dep": "Eric and Wendy Schmidt Center;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.broadinstitute.org;https://www.csail.mit.edu", "aff_unique_abbr": "Broad Institute;MIT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Anticorrelated Noise Injection for Improved Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18421", "id": "18421", "proceeding": "https://proceedings.mlr.press/v162/orvieto22a.html", "poster": "/media/PosterPDFs/ICML%202022/0b7a9d54deeb611edc4540d286e9a042.png?t=1658137944.4295943", "slides": "", "author_site": "Antonio Orvieto, Hans Kersting, Frank Proske, Francis Bach, Aurelien Lucchi", "author": "Antonio Orvieto; Hans Kersting; Frank Proske; Francis Bach; Aurelien Lucchi", "abstract": "Injecting artificial noise into gradient descent (GD) is commonly employed to improve the performance of machine learning models. Usually, uncorrelated noise is used in such perturbed gradient descent (PGD) methods. It is, however, not known if this is optimal or whether other types of noise could provide better generalization performance. In this paper, we zoom in on the problem of correlating the perturbations of consecutive PGD steps. We consider a variety of objective functions for which we find that GD with anticorrelated perturbations (\"Anti-PGD\") generalizes significantly better than GD and standard (uncorrelated) PGD. To support these experimental findings, we also derive a theoretical analysis that demonstrates that Anti-PGD moves to wider minima, while GD and PGD remain stuck in suboptimal regions or even diverge. 
This new connection between anticorrelated noise and generalization opens the field to novel ways to exploit noise for training machine learning models.", "bibtex": "@InProceedings{pmlr-v162-orvieto22a,\n title = \t {Anticorrelated Noise Injection for Improved Generalization},\n author = {Orvieto, Antonio and Kersting, Hans and Proske, Frank and Bach, Francis and Lucchi, Aurelien},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17094--17116},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/orvieto22a/orvieto22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/orvieto22a.html},\n abstract = \t {Injecting artificial noise into gradient descent (GD) is commonly employed to improve the performance of machine learning models. Usually, uncorrelated noise is used in such perturbed gradient descent (PGD) methods. It is, however, not known if this is optimal or whether other types of noise could provide better generalization performance. In this paper, we zoom in on the problem of correlating the perturbations of consecutive PGD steps. We consider a variety of objective functions for which we find that GD with anticorrelated perturbations (\"Anti-PGD\") generalizes significantly better than GD and standard (uncorrelated) PGD. To support these experimental findings, we also derive a theoretical analysis that demonstrates that Anti-PGD moves to wider minima, while GD and PGD remain stuck in suboptimal regions or even diverge. This new connection between anticorrelated noise and generalization opens the field to novel ways to exploit noise for training machine learning models.}\n}", "pdf": "https://proceedings.mlr.press/v162/orvieto22a/orvieto22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/orvieto22a-supp.zip", "pdf_size": 4013276, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3285173644361398272&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, ETH Zurich, Switzerland; INRIA \u2013 Ecole Normale Sup\u00e9rieure \u2013 PSL Research University, Paris, France; Department of Mathematics, University of Oslo, Norway; INRIA \u2013 Ecole Normale Sup\u00e9rieure \u2013 PSL Research University, Paris, France; Department of Mathematics and Computer Science, University of Basel, Switzerland", "aff_domain": "inf.ethz.ch;inria.fr; ;inria.fr;unibas.ch", "email": "inf.ethz.ch;inria.fr; ;inria.fr;unibas.ch", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/orvieto22a.html", "aff_unique_index": "0;1;2;1;3", "aff_unique_norm": "ETH Zurich;INRIA;University of Oslo;University of Basel", "aff_unique_dep": "Department of Computer Science;;Department of Mathematics;Department of Mathematics and Computer Science", "aff_unique_url": "https://www.ethz.ch;https://www.inria.fr;https://www.uio.no;https://www.unibas.ch", "aff_unique_abbr": "ETHZ;INRIA;UiO;UniBas", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;1;2;1;0", "aff_country_unique": "Switzerland;France;Norway" }, { "title": "AnyMorph: Learning Transferable Polices By Inferring Agent Morphology", "status": "Spotlight", "track": "main", "site": 
"https://icml.cc/virtual/2022/poster/17811", "id": "17811", "proceeding": "https://proceedings.mlr.press/v162/trabucco22b.html", "poster": "/media/PosterPDFs/ICML%202022/8a7cf65139a9fbb34f03b046d8dc597c_lafO8PW.png?t=1657997006.079721", "slides": "", "author_site": "Brandon Trabucco, mariano phielipp, Glen Berseth", "author": "Brandon Trabucco; Mariano Phielipp; Glen Berseth", "abstract": "The prototypical approach to reinforcement learning involves training policies tailored to a particular agent from scratch for every new morphology. Recent work aims to eliminate the re-training of policies by investigating whether a morphology-agnostic policy, trained on a diverse set of agents with similar task objectives, can be transferred to new agents with unseen morphologies without re-training. This is a challenging problem that required previous approaches to use hand-designed descriptions of the new agent\u2019s morphology. Instead of hand-designing this description, we propose a data-driven method that learns a representation of morphology directly from the reinforcement learning objective. Ours is the first reinforcement learning algorithm that can train a policy to generalize to new agent morphologies without requiring a description of the agent\u2019s morphology in advance. We evaluate our approach on the standard benchmark for agent-agnostic control, and improve over the current state of the art in zero-shot generalization to new agents. Importantly, our method attains good performance without an explicit description of morphology.", "bibtex": "@InProceedings{pmlr-v162-trabucco22b,\n title = \t {{A}ny{M}orph: Learning Transferable Polices By Inferring Agent Morphology},\n author = {Trabucco, Brandon and Phielipp, Mariano and Berseth, Glen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21677--21691},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/trabucco22b/trabucco22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/trabucco22b.html},\n abstract = \t {The prototypical approach to reinforcement learning involves training policies tailored to a particular agent from scratch for every new morphology. Recent work aims to eliminate the re-training of policies by investigating whether a morphology-agnostic policy, trained on a diverse set of agents with similar task objectives, can be transferred to new agents with unseen morphologies without re-training. This is a challenging problem that required previous approaches to use hand-designed descriptions of the new agent\u2019s morphology. Instead of hand-designing this description, we propose a data-driven method that learns a representation of morphology directly from the reinforcement learning objective. Ours is the first reinforcement learning algorithm that can train a policy to generalize to new agent morphologies without requiring a description of the agent\u2019s morphology in advance. We evaluate our approach on the standard benchmark for agent-agnostic control, and improve over the current state of the art in zero-shot generalization to new agents. 
Importantly, our method attains good performance without an explicit description of morphology.}\n}", "pdf": "https://proceedings.mlr.press/v162/trabucco22b/trabucco22b.pdf", "supp": "", "pdf_size": 697612, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4438780222530743029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Machine Learning Department, Carnegie Mellon University; Intel AI; Mila", "aff_domain": "cmu.edu; ; ", "email": "cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/trabucco22b.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;Intel;Mila", "aff_unique_dep": "Machine Learning Department;Intel AI;Quebec Artificial Intelligence Institute", "aff_unique_url": "https://www.cmu.edu;https://www.intel.com;https://mila.quebec", "aff_unique_abbr": "CMU;Intel;Mila", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Canada" }, { "title": "Anytime Information Cascade Popularity Prediction via Self-Exciting Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17757", "id": "17757", "proceeding": "https://proceedings.mlr.press/v162/zhang22a.html", "poster": "/media/PosterPDFs/ICML%202022/5e9d17e41f784ae361ada1d0817186f6.png?t=1657749231.8582492", "slides": "", "author_site": "Xi Zhang, Akshay Aravamudan, Georgios Anagnostopoulos", "author": "Xi Zhang; Akshay Aravamudan; Georgios C Anagnostopoulos", "abstract": "One important aspect of understanding behaviors of information cascades is to be able to accurately predict their popularity, that is, their message counts at any future time. Self-exciting Hawkes processes have been widely adopted for such tasks due to their success in describing cascading behaviors. In this paper, for general, marked Hawkes point processes, we present closed-form expressions for the mean and variance of future event counts, conditioned on observed events. Furthermore, these expressions allow us to develop a predictive approach, namely, Cascade Anytime Size Prediction via self-Exciting Regression model (CASPER), which is specifically tailored to popularity prediction, unlike existing generative approaches {\u2013} based on point processes {\u2013} for the same task. We showcase CASPER\u2019s merits via experiments entailing both synthetic and real-world data, and demonstrate that it considerably improves upon prior works in terms of accuracy, especially for early-stage prediction.", "bibtex": "@InProceedings{pmlr-v162-zhang22a,\n title = \t {Anytime Information Cascade Popularity Prediction via Self-Exciting Processes},\n author = {Zhang, Xi and Aravamudan, Akshay and Anagnostopoulos, Georgios C},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26028--26047},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22a/zhang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22a.html},\n abstract = \t {One important aspect of understanding behaviors of information cascades is to be able to accurately predict their popularity, that is, their message counts at any future time. 
Self-exciting Hawkes processes have been widely adopted for such tasks due to their success in describing cascading behaviors. In this paper, for general, marked Hawkes point processes, we present closed-form expressions for the mean and variance of future event counts, conditioned on observed events. Furthermore, these expressions allow us to develop a predictive approach, namely, Cascade Anytime Size Prediction via self-Exciting Regression model (CASPER), which is specifically tailored to popularity prediction, unlike existing generative approaches {\u2013} based on point processes {\u2013} for the same task. We showcase CASPER\u2019s merits via experiments entailing both synthetic and real-world data, and demonstrate that it considerably improves upon prior works in terms of accuracy, especially for early-stage prediction.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22a/zhang22a.pdf", "supp": "", "pdf_size": 865612, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1006025443768252302&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Engineering & Sciences, Florida Institute of Technology, Melbourne, FL, USA; Department of Computer Engineering & Sciences, Florida Institute of Technology, Melbourne, FL, USA; Department of Computer Engineering & Sciences, Florida Institute of Technology, Melbourne, FL, USA", "aff_domain": "my.fit.edu; ; ", "email": "my.fit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Florida Institute of Technology", "aff_unique_dep": "Department of Computer Engineering & Sciences", "aff_unique_url": "https://www.fit.edu", "aff_unique_abbr": "FIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Melbourne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Approximate Bayesian Computation with Domain Expert in the Loop", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17117", "id": "17117", "proceeding": "https://proceedings.mlr.press/v162/bharti22a.html", "poster": "/media/PosterPDFs/ICML%202022/363763e5c3dc3a68b399058c34aecf2c_vDn9CLm.png?t=1657795594.5741072", "slides": "", "author_site": "Ayush Bharti, Louis Filstroff, Samuel Kaski", "author": "Ayush Bharti; Louis Filstroff; Samuel Kaski", "abstract": "Approximate Bayesian computation (ABC) is a popular likelihood-free inference method for models with intractable likelihood functions. As ABC methods usually rely on comparing summary statistics of observed and simulated data, the choice of the statistics is crucial. This choice involves a trade-off between loss of information and dimensionality reduction, and is often determined based on domain knowledge. However, handcrafting and selecting suitable statistics is a laborious task involving multiple trial-and-error steps. In this work, we introduce an active learning method for ABC statistics selection which reduces the domain expert\u2019s work considerably. By involving the experts, we are able to handle misspecified models, unlike the existing dimension reduction methods. 
Moreover, empirical results show better posterior estimates than with existing methods, when the simulation budget is limited.", "bibtex": "@InProceedings{pmlr-v162-bharti22a,\n title = \t {Approximate {B}ayesian Computation with Domain Expert in the Loop},\n author = {Bharti, Ayush and Filstroff, Louis and Kaski, Samuel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1893--1905},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bharti22a/bharti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bharti22a.html},\n abstract = \t {Approximate Bayesian computation (ABC) is a popular likelihood-free inference method for models with intractable likelihood functions. As ABC methods usually rely on comparing summary statistics of observed and simulated data, the choice of the statistics is crucial. This choice involves a trade-off between loss of information and dimensionality reduction, and is often determined based on domain knowledge. However, handcrafting and selecting suitable statistics is a laborious task involving multiple trial-and-error steps. In this work, we introduce an active learning method for ABC statistics selection which reduces the domain expert\u2019s work considerably. By involving the experts, we are able to handle misspecified models, unlike the existing dimension reduction methods. Moreover, empirical results show better posterior estimates than with existing methods, when the simulation budget is limited.}\n}", "pdf": "https://proceedings.mlr.press/v162/bharti22a/bharti22a.pdf", "supp": "", "pdf_size": 655572, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17515613516089862675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Department of Computer Science, Aalto University, Espoo, Finland+Department of Computer Science, University of Manchester, Manchester, United Kingdom; Department of Computer Science, Aalto University, Espoo, Finland; Department of Computer Science, Aalto University, Espoo, Finland+Department of Computer Science, University of Manchester, Manchester, United Kingdom", "aff_domain": "aalto.fi; ; ", "email": "aalto.fi; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bharti22a.html", "aff_unique_index": "0+1;0;0+1", "aff_unique_norm": "Aalto University;University of Manchester", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.aalto.fi;https://www.manchester.ac.uk", "aff_unique_abbr": "Aalto;UoM", "aff_campus_unique_index": "0+1;0;0+1", "aff_campus_unique": "Espoo;Manchester", "aff_country_unique_index": "0+1;0;0+1", "aff_country_unique": "Finland;United Kingdom" }, { "title": "Approximate Frank-Wolfe Algorithms over Graph-structured Support Sets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17937", "id": "17937", "proceeding": "https://proceedings.mlr.press/v162/zhou22i.html", "poster": "/media/PosterPDFs/ICML%202022/0768281a05da9f27df178b5c39a51263.png?t=1658243001.777864", "slides": "", "author_site": "Baojian Zhou, Yifan Sun", "author": "Baojian Zhou; Yifan Sun", "abstract": "In this paper, we consider approximate Frank-Wolfe 
(FW) algorithms to solve convex optimization problems over graph-structured support sets where the linear minimization oracle (LMO) cannot be efficiently obtained in general. We first demonstrate that two popular approximation assumptions (additive and multiplicative gap errors) are not applicable in that no cheap gap-approximate LMO oracle exists. Thus, approximate dual maximization oracles (DMO) are proposed, which approximate the inner product rather than the gap. We prove that the standard FW method using a $\\delta$-approximate DMO converges as $O((1-\\delta) \\sqrt{s}/\\delta)$ in the worst case, and as $O(L/(\\delta^2 t))$ over a $\\delta$-relaxation of the constraint set. Furthermore, when the solution is on the boundary, a variant of FW converges as $O(1/t^2)$ under the quadratic growth assumption. Our empirical results suggest that even these improved bounds are pessimistic, showing fast convergence in recovering real-world images with graph-structured sparsity.", "bibtex": "@InProceedings{pmlr-v162-zhou22i,\n title = \t {Approximate Frank-{W}olfe Algorithms over Graph-structured Support Sets},\n author = {Zhou, Baojian and Sun, Yifan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27303--27337},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22i/zhou22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22i.html},\n abstract = \t {In this paper, we consider approximate Frank-Wolfe (FW) algorithms to solve convex optimization problems over graph-structured support sets where the linear minimization oracle (LMO) cannot be efficiently obtained in general. We first demonstrate that two popular approximation assumptions (additive and multiplicative gap errors) are not applicable in that no cheap gap-approximate LMO oracle exists. Thus, approximate dual maximization oracles (DMO) are proposed, which approximate the inner product rather than the gap. We prove that the standard FW method using a $\\delta$-approximate DMO converges as $O((1-\\delta) \\sqrt{s}/\\delta)$ in the worst case, and as $O(L/(\\delta^2 t))$ over a $\\delta$-relaxation of the constraint set. Furthermore, when the solution is on the boundary, a variant of FW converges as $O(1/t^2)$ under the quadratic growth assumption. 
Our empirical results suggest that even these improved bounds are pessimistic, showing fast convergence in recovering real-world images with graph-structured sparsity.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22i/zhou22i.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhou22i-supp.zip", "pdf_size": 1199914, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14741694353747789273&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Data Science, Fudan University, Shanghai, China; Department of Computer Science, Stony Brook University, Stony Brook, New York, USA", "aff_domain": "fudan.edu.cn; ", "email": "fudan.edu.cn; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhou22i.html", "aff_unique_index": "0;1", "aff_unique_norm": "Fudan University;Stony Brook University", "aff_unique_dep": "School of Data Science;Department of Computer Science", "aff_unique_url": "https://www.fudan.edu.cn;https://www.stonybrook.edu", "aff_unique_abbr": "Fudan;SBU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Shanghai;Stony Brook", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Approximately Equivariant Networks for Imperfectly Symmetric Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16949", "id": "16949", "proceeding": "https://proceedings.mlr.press/v162/wang22aa.html", "poster": "/media/PosterPDFs/ICML%202022/da4902cb0bc38210839714ebdcf0efc3.png?t=1657304089.2077928", "slides": "/media/icml-2022/Slides/16949.pdf", "author_site": "Rui Wang, Robin Walters, Rose Yu", "author": "Rui Wang; Robin Walters; Rose Yu", "abstract": "Incorporating symmetry as an inductive bias into neural network architecture has led to improvements in generalization, data efficiency, and physical consistency in dynamics modeling. Methods such as CNNs or equivariant neural networks use weight tying to enforce symmetries such as shift invariance or rotational equivariance. However, despite the fact that physical laws obey many symmetries, real-world dynamical data rarely conforms to strict mathematical symmetry either due to noisy or incomplete data or to symmetry breaking features in the underlying dynamical system. We explore approximately equivariant networks which are biased towards preserving symmetry but are not strictly constrained to do so. 
By relaxing equivariance constraints, we find that our models can outperform both baselines with no symmetry bias and baselines with overly strict symmetry in both simulated turbulence domains and real-world multi-stream jet flow.", "bibtex": "@InProceedings{pmlr-v162-wang22aa,\n title = \t {Approximately Equivariant Networks for Imperfectly Symmetric Dynamics},\n author = {Wang, Rui and Walters, Robin and Yu, Rose},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23078--23091},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22aa/wang22aa.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22aa.html},\n abstract = \t {Incorporating symmetry as an inductive bias into neural network architecture has led to improvements in generalization, data efficiency, and physical consistency in dynamics modeling. Methods such as CNNs or equivariant neural networks use weight tying to enforce symmetries such as shift invariance or rotational equivariance. However, despite the fact that physical laws obey many symmetries, real-world dynamical data rarely conforms to strict mathematical symmetry either due to noisy or incomplete data or to symmetry breaking features in the underlying dynamical system. We explore approximately equivariant networks which are biased towards preserving symmetry but are not strictly constrained to do so. By relaxing equivariance constraints, we find that our models can outperform both baselines with no symmetry bias and baselines with overly strict symmetry in both simulated turbulence domains and real-world multi-stream jet flow.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22aa/wang22aa.pdf", "supp": "", "pdf_size": 5256066, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5872423159806810171&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of California San Diego; Northeastern University; University of California San Diego", "aff_domain": "ucsd.edu; ; ", "email": "ucsd.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22aa.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, San Diego;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://ucsd.edu;https://www.northeastern.edu", "aff_unique_abbr": "UCSD;NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Architecture Agnostic Federated Learning for Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16925", "id": "16925", "proceeding": "https://proceedings.mlr.press/v162/makhija22a.html", "poster": "", "slides": "", "author_site": "Disha Makhija, Xing Han, Nhat Ho, Joydeep Ghosh", "author": "Disha Makhija; Xing Han; Nhat Ho; Joydeep Ghosh", "abstract": "With growing concerns regarding data privacy and rapid increase in data volume, Federated Learning (FL) has become an important learning paradigm. 
However, jointly learning a deep neural network model in a FL setting proves to be a non-trivial task because of the complexities associated with the neural networks, such as varied architectures across clients, permutation invariance of the neurons, and presence of non-linear transformations in each layer. This work introduces a novel framework, Federated Heterogeneous Neural Networks (FedHeNN), that allows each client to build a personalised model without enforcing a common architecture across clients. This allows each client to optimize with respect to local data and compute constraints, while still benefiting from the learnings of other (potentially more powerful) clients. The key idea of FedHeNN is to use the instance-level representations obtained from peer clients to guide the simultaneous training on each client. The extensive experimental results demonstrate that the FedHeNN framework is capable of learning better performing models on clients in both the settings of homogeneous and heterogeneous architectures across clients.", "bibtex": "@InProceedings{pmlr-v162-makhija22a,\n title = \t {Architecture Agnostic Federated Learning for Neural Networks},\n author = {Makhija, Disha and Han, Xing and Ho, Nhat and Ghosh, Joydeep},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14860--14870},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/makhija22a/makhija22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/makhija22a.html},\n abstract = \t {With growing concerns regarding data privacy and rapid increase in data volume, Federated Learning (FL) has become an important learning paradigm. However, jointly learning a deep neural network model in a FL setting proves to be a non-trivial task because of the complexities associated with the neural networks, such as varied architectures across clients, permutation invariance of the neurons, and presence of non-linear transformations in each layer. This work introduces a novel framework, Federated Heterogeneous Neural Networks (FedHeNN), that allows each client to build a personalised model without enforcing a common architecture across clients. This allows each client to optimize with respect to local data and compute constraints, while still benefiting from the learnings of other (potentially more powerful) clients. The key idea of FedHeNN is to use the instance-level representations obtained from peer clients to guide the simultaneous training on each client. 
The extensive experimental results demonstrate that the FedHeNN framework is capable of learning better performing models on clients in both the settings of homogeneous and heterogeneous architectures across clients.}\n}", "pdf": "https://proceedings.mlr.press/v162/makhija22a/makhija22a.pdf", "supp": "", "pdf_size": 308537, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=385492613340669454&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "The University of Texas at Austin; The University of Texas at Austin; The University of Texas at Austin; The University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ", "email": "utexas.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/makhija22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Asking for Knowledge (AFK): Training RL Agents to Query External Knowledge Using Language", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17095", "id": "17095", "proceeding": "https://proceedings.mlr.press/v162/liu22t.html", "poster": "/media/PosterPDFs/ICML%202022/512fc3c5227f637e41437c999a2d3169.png?t=1657574867.1616566", "slides": "", "author_site": "Iou-Jen Liu, Xingdi Yuan, Marc-Alexandre C\u00f4t\u00e9, Pierre-Yves Oudeyer, Alex Schwing", "author": "Iou-Jen Liu; Xingdi Yuan; Marc-Alexandre C\u00f4t\u00e9; Pierre-Yves Oudeyer; Alexander Schwing", "abstract": "To solve difficult tasks, humans ask questions to acquire knowledge from external sources. In contrast, classical reinforcement learning agents lack such an ability and often resort to exploratory behavior. This is exacerbated as few present-day environments support querying for knowledge. In order to study how agents can be taught to query external knowledge via language, we first introduce two new environments: the grid-world-based Q-BabyAI and the text-based Q-TextWorld. In addition to physical interactions, an agent can query an external knowledge source specialized for these environments to gather information. Second, we propose the \u2018Asking for Knowledge\u2019 (AFK) agent, which learns to generate language commands to query for meaningful knowledge that helps solve the tasks. AFK leverages a non-parametric memory, a pointer mechanism and an episodic exploration bonus to tackle (1) irrelevant information, (2) a large query language space, (3) delayed reward for making meaningful queries. 
Extensive experiments demonstrate that the AFK agent outperforms recent baselines on the challenging Q-BabyAI and Q-TextWorld environments.", "bibtex": "@InProceedings{pmlr-v162-liu22t,\n title = \t {Asking for Knowledge ({AFK}): Training {RL} Agents to Query External Knowledge Using Language},\n author = {Liu, Iou-Jen and Yuan, Xingdi and C{\\^o}t{\\'e}, Marc-Alexandre and Oudeyer, Pierre-Yves and Schwing, Alexander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14073--14093},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22t/liu22t.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22t.html},\n abstract = \t {To solve difficult tasks, humans ask questions to acquire knowledge from external sources. In contrast, classical reinforcement learning agents lack such an ability and often resort to exploratory behavior. This is exacerbated as few present-day environments support querying for knowledge. In order to study how agents can be taught to query external knowledge via language, we first introduce two new environments: the grid-world-based Q-BabyAI and the text-based Q-TextWorld. In addition to physical interactions, an agent can query an external knowledge source specialized for these environments to gather information. Second, we propose the \u2018Asking for Knowledge\u2019 (AFK) agent, which learns to generate language commands to query for meaningful knowledge that helps solve the tasks. AFK leverages a non-parametric memory, a pointer mechanism and an episodic exploration bonus to tackle (1) irrelevant information, (2) a large query language space, (3) delayed reward for making meaningful queries. Extensive experiments demonstrate that the AFK agent outperforms recent baselines on the challenging Q-BabyAI and Q-TextWorld environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22t/liu22t.pdf", "supp": "", "pdf_size": 4266168, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11504720771775323385&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "University of Illinois at Urbana-Champaign, IL, U.S.A. 
+ Microsoft Research, Montr\u00e9al, Canada; Microsoft Research, Montr\u00e9al, Canada; Microsoft Research, Montr\u00e9al, Canada + Inria, France; Inria, France; University of Illinois at Urbana-Champaign, IL, U.S.A.", "aff_domain": "illinois.edu;microsoft.com; ;inria.fr; ", "email": "illinois.edu;microsoft.com; ;inria.fr; ", "github": "https://ioujenliu.github.io/AFK", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22t.html", "aff_unique_index": "0+1;1;1+2;2;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Microsoft;INRIA", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://illinois.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-montreal;https://www.inria.fr", "aff_unique_abbr": "UIUC;MSR;Inria", "aff_campus_unique_index": "0+1;1;1;0", "aff_campus_unique": "Urbana-Champaign;Montr\u00e9al;", "aff_country_unique_index": "0+1;1;1+2;2;0", "aff_country_unique": "United States;Canada;France" }, { "title": "Asymptotically-Optimal Gaussian Bandits with Side Observations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17235", "id": "17235", "proceeding": "https://proceedings.mlr.press/v162/atsidakou22a.html", "poster": "/media/PosterPDFs/ICML%202022/239f914f30ea3c948fce2ea07a9efb33.png?t=1657383972.0056264", "slides": "", "author_site": "Alexia Atsidakou, Orestis Papadigenopoulos, Constantine Caramanis, Sujay Sanghavi, Sanjay Shakkottai", "author": "Alexia Atsidakou; Orestis Papadigenopoulos; Constantine Caramanis; Sujay Sanghavi; Sanjay Shakkottai", "abstract": "We study the problem of Gaussian bandits with general side information, as first introduced by Wu, Szepesv\u00e1ri, and Gy\u00f6rgy. In this setting, the play of an arm reveals information about other arms, according to an arbitrary", "bibtex": "@InProceedings{pmlr-v162-atsidakou22a,\n title = \t {Asymptotically-Optimal {G}aussian Bandits with Side Observations},\n author = {Atsidakou, Alexia and Papadigenopoulos, Orestis and Caramanis, Constantine and Sanghavi, Sujay and Shakkottai, Sanjay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1057--1077},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/atsidakou22a/atsidakou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/atsidakou22a.html},\n abstract = \t {We study the problem of Gaussian bandits with general side information, as first introduced by Wu, Szepesv\u00e1ri, and Gy\u00f6rgy. 
In this setting, the play of an arm reveals information about other arms, according to an arbitrary", "pdf": "https://proceedings.mlr.press/v162/atsidakou22a/atsidakou22a.pdf", "supp": "", "pdf_size": 426856, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=107750334685274843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Computer Science, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin + Amazon Science; Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin", "aff_domain": "utexas.edu;cs.utexas.edu; ; ; ", "email": "utexas.edu;cs.utexas.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/atsidakou22a.html", "aff_unique_index": "0;0;0+1;0;0", "aff_unique_norm": "University of Texas at Austin;Amazon", "aff_unique_dep": "Department of Electrical and Computer Engineering;Amazon Science", "aff_unique_url": "https://www.utexas.edu;https://www.amazon.science", "aff_unique_abbr": "UT Austin;Amazon Science", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Attentional Meta-learners for Few-shot Polythetic Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16067", "id": "16067", "proceeding": "https://proceedings.mlr.press/v162/day22a.html", "poster": "/media/PosterPDFs/ICML%202022/f7b6bc883be91f56eb248d72de4d2847.png?t=1657187311.8535109", "slides": "", "author_site": "Ben Day, Ramon Vi\u00f1as Torn\u00e9, Nikola Simidjievski, Pietro Li\u00f3", "author": "Ben J Day; Ramon Vi\u00f1as Torn\u00e9; Nikola Simidjievski; Pietro Li\u00f3", "abstract": "Polythetic classifications, based on shared patterns of features that need neither be universal nor constant among members of a class, are common in the natural world and greatly outnumber monothetic classifications over a set of features. We show that threshold meta-learners, such as Prototypical Networks, require an embedding dimension that is exponential in the number of task-relevant features to emulate these functions. In contrast, attentional classifiers, such as Matching Networks, are polythetic by default and able to solve these problems with a linear embedding dimension. However, we find that in the presence of task-irrelevant features, inherent to meta-learning problems, attentional models are susceptible to misclassification. To address this challenge, we propose a self-attention feature-selection mechanism that adaptively dilutes non-discriminative features. 
We demonstrate the effectiveness of our approach in meta-learning Boolean functions, and synthetic and real-world few-shot learning tasks.", "bibtex": "@InProceedings{pmlr-v162-day22a,\n title = \t {Attentional Meta-learners for Few-shot Polythetic Classification},\n author = {Day, Ben J and Torn{\\'e}, Ramon Vi{\\~n}as and Simidjievski, Nikola and Li{\\'o}, Pietro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4867--4889},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/day22a/day22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/day22a.html},\n abstract = \t {Polythetic classifications, based on shared patterns of features that need neither be universal nor constant among members of a class, are common in the natural world and greatly outnumber monothetic classifications over a set of features. We show that threshold meta-learners, such as Prototypical Networks, require an embedding dimension that is exponential in the number of task-relevant features to emulate these functions. In contrast, attentional classifiers, such as Matching Networks, are polythetic by default and able to solve these problems with a linear embedding dimension. However, we find that in the presence of task-irrelevant features, inherent to meta-learning problems, attentional models are susceptible to misclassification. To address this challenge, we propose a self-attention feature-selection mechanism that adaptively dilutes non-discriminative features. 
We demonstrate the effectiveness of our approach in meta-learning Boolean functions, and synthetic and real-world few-shot learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/day22a/day22a.pdf", "supp": "", "pdf_size": 4334801, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5360824455580624680&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Technology, University of Cambridge, Cambridge, UK; Department of Computer Science and Technology, University of Cambridge, Cambridge, UK; Department of Computer Science and Technology, University of Cambridge, Cambridge, UK; Department of Computer Science and Technology, University of Cambridge, Cambridge, UK", "aff_domain": "cam.ac.uk.edu;cam.ac.uk.edu; ; ", "email": "cam.ac.uk.edu;cam.ac.uk.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/day22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Augment with Care: Contrastive Learning for Combinatorial Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16739", "id": "16739", "proceeding": "https://proceedings.mlr.press/v162/duan22b.html", "poster": "/media/PosterPDFs/ICML%202022/fef6f971605336724b5e6c0c12dc2534.png?t=1657919671.197083", "slides": "", "author_site": "Haonan Duan, Pashootan Vaezipoor, Max Paulus, Yangjun Ruan, Chris Maddison", "author": "Haonan Duan; Pashootan Vaezipoor; Max B Paulus; Yangjun Ruan; Chris Maddison", "abstract": "Supervised learning can improve the design of state-of-the-art solvers for combinatorial problems, but labelling large numbers of combinatorial instances is often impractical due to exponential worst-case complexity. Inspired by the recent success of contrastive pre-training for images, we conduct a scientific study of the effect of augmentation design on contrastive pre-training for the Boolean satisfiability problem. While typical graph contrastive pre-training uses label-agnostic augmentations, our key insight is that many combinatorial problems have well-studied invariances, which allow for the design of label-preserving augmentations. We find that label-preserving augmentations are critical for the success of contrastive pre-training. We show that our representations are able to achieve comparable test accuracy to fully-supervised learning while using only 1% of the labels. We also demonstrate that our representations are more transferable to larger problems from unseen domains. 
Our code is available at https://github.com/h4duan/contrastive-sat.", "bibtex": "@InProceedings{pmlr-v162-duan22b,\n title = \t {Augment with Care: Contrastive Learning for Combinatorial Problems},\n author = {Duan, Haonan and Vaezipoor, Pashootan and Paulus, Max B and Ruan, Yangjun and Maddison, Chris},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5627--5642},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/duan22b/duan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/duan22b.html},\n abstract = \t {Supervised learning can improve the design of state-of-the-art solvers for combinatorial problems, but labelling large numbers of combinatorial instances is often impractical due to exponential worst-case complexity. Inspired by the recent success of contrastive pre-training for images, we conduct a scientific study of the effect of augmentation design on contrastive pre-training for the Boolean satisfiability problem. While typical graph contrastive pre-training uses label-agnostic augmentations, our key insight is that many combinatorial problems have well-studied invariances, which allow for the design of label-preserving augmentations. We find that label-preserving augmentations are critical for the success of contrastive pre-training. We show that our representations are able to achieve comparable test accuracy to fully-supervised learning while using only 1% of the labels. We also demonstrate that our representations are more transferable to larger problems from unseen domains. 
Our code is available at https://github.com/h4duan/contrastive-sat.}\n}", "pdf": "https://proceedings.mlr.press/v162/duan22b/duan22b.pdf", "supp": "", "pdf_size": 3859141, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9493161688768381207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Toronto + Vector Institute; University of Toronto + Vector Institute; ETH Z\u00fcrich; University of Toronto + Vector Institute; University of Toronto + Vector Institute", "aff_domain": "cs.toronto.edu; ;inf.ethz.ch; ; ", "email": "cs.toronto.edu; ;inf.ethz.ch; ; ", "github": "https://github.com/h4duan/contrastive-sat", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/duan22b.html", "aff_unique_index": "0+1;0+1;2;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute;ETH Zurich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.ethz.ch", "aff_unique_abbr": "U of T;Vector Institute;ETHZ", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0", "aff_country_unique": "Canada;Switzerland" }, { "title": "AutoIP: A United Framework to Integrate Physics into Gaussian Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16201", "id": "16201", "proceeding": "https://proceedings.mlr.press/v162/long22a.html", "poster": "/media/PosterPDFs/ICML%202022/236f119f58f5fd102c5a2ca609fdcbd8.png?t=1657224326.8489306", "slides": "", "author_site": "Da Long, Zheng Wang, Aditi Krishnapriyan, Robert Kirby, Shandian Zhe, Michael Mahoney", "author": "Da Long; Zheng Wang; Aditi Krishnapriyan; Robert Kirby; Shandian Zhe; Michael Mahoney", "abstract": "Physical modeling is critical for many modern science and engineering applications. From a data science or machine learning perspective, where more domain-agnostic, data-driven models are pervasive, physical knowledge {\u2014} often expressed as differential equations {\u2014} is valuable in that it is complementary to data, and it can potentially help overcome issues such as data sparsity, noise, and inaccuracy. In this work, we propose a simple, yet powerful and general framework {\u2014} AutoIP, for Automatically Incorporating Physics {\u2014} that can integrate all kinds of differential equations into Gaussian Processes (GPs) to enhance prediction accuracy and uncertainty quantification. These equations can be linear or nonlinear, spatial, temporal, or spatio-temporal, complete or incomplete with unknown source terms, and so on. Based on kernel differentiation, we construct a GP prior to sample the values of the target function, equation related derivatives, and latent source functions, which are all jointly from a multivariate Gaussian distribution. The sampled values are fed to two likelihoods: one to fit the observations, and the other to conform to the equation. We use the whitening method to evade the strong dependency between the sampled function values and kernel parameters, and we develop a stochastic variational learning algorithm. 
AutoIP shows improvement upon vanilla GPs in both simulation and several real-world applications, even using rough, incomplete equations.", "bibtex": "@InProceedings{pmlr-v162-long22a,\n title = \t {{A}uto{IP}: A United Framework to Integrate Physics into {G}aussian Processes},\n author = {Long, Da and Wang, Zheng and Krishnapriyan, Aditi and Kirby, Robert and Zhe, Shandian and Mahoney, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14210--14222},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/long22a/long22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/long22a.html},\n abstract = \t {Physical modeling is critical for many modern science and engineering applications. From a data science or machine learning perspective, where more domain-agnostic, data-driven models are pervasive, physical knowledge {\u2014} often expressed as differential equations {\u2014} is valuable in that it is complementary to data, and it can potentially help overcome issues such as data sparsity, noise, and inaccuracy. In this work, we propose a simple, yet powerful and general framework {\u2014} AutoIP, for Automatically Incorporating Physics {\u2014} that can integrate all kinds of differential equations into Gaussian Processes (GPs) to enhance prediction accuracy and uncertainty quantification. These equations can be linear or nonlinear, spatial, temporal, or spatio-temporal, complete or incomplete with unknown source terms, and so on. Based on kernel differentiation, we construct a GP prior to sample the values of the target function, equation related derivatives, and latent source functions, which are all jointly from a multivariate Gaussian distribution. The sampled values are fed to two likelihoods: one to fit the observations, and the other to conform to the equation. We use the whitening method to evade the strong dependency between the sampled function values and kernel parameters, and we develop a stochastic variational learning algorithm. 
AutoIP shows improvement upon vanilla GPs in both simulation and several real-world applications, even using rough, incomplete equations.}\n}", "pdf": "https://proceedings.mlr.press/v162/long22a/long22a.pdf", "supp": "", "pdf_size": 866305, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8867104467661921619&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Utah; University of Utah; University of California, Berkeley + Lawrence Berkeley National Laboratory; University of Utah; University of Utah; University of California, Berkeley + Lawrence Berkeley National Laboratory + International Computer Science Institute", "aff_domain": "cs.utah.edu; ; ; ;cs.utah.edu; ", "email": "cs.utah.edu; ; ; ;cs.utah.edu; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/long22a.html", "aff_unique_index": "0;0;1+2;0;0;1+2+3", "aff_unique_norm": "University of Utah;University of California, Berkeley;Lawrence Berkeley National Laboratory;International Computer Science Institute", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utah.edu;https://www.berkeley.edu;https://www.lbl.gov;https://www.icsi.berkeley.edu/", "aff_unique_abbr": "Utah;UC Berkeley;LBNL;ICSI", "aff_campus_unique_index": "1+1;1+1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0+0;0;0;0+0+0", "aff_country_unique": "United States" }, { "title": "AutoSNN: Towards Energy-Efficient Spiking Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17755", "id": "17755", "proceeding": "https://proceedings.mlr.press/v162/na22a.html", "poster": "/media/PosterPDFs/ICML%202022/e2c420d928d4bf8ce0ff2ec19b371514_J7dnpQw.png?t=1657173946.8302994", "slides": "", "author_site": "Byunggook Na, Jisoo Mok, Seongsik Park, Dongjin Lee, Hyeokjun Choe, Sungroh Yoon", "author": "Byunggook Na; Jisoo Mok; Seongsik Park; Dongjin Lee; Hyeokjun Choe; Sungroh Yoon", "abstract": "Spiking neural networks (SNNs) that mimic information transmission in the brain can energy-efficiently process spatio-temporal information through discrete and sparse spikes, thereby receiving considerable attention. To improve accuracy and energy efficiency of SNNs, most previous studies have focused solely on training methods, and the effect of architecture has rarely been studied. We investigate the design choices used in the previous studies in terms of the accuracy and number of spikes and figure out that they are not best-suited for SNNs. To further improve the accuracy and reduce the spikes generated by SNNs, we propose a spike-aware neural architecture search framework called AutoSNN. We define a search space consisting of architectures without undesirable design choices. To enable the spike-aware architecture search, we introduce a fitness that considers both the accuracy and number of spikes. AutoSNN successfully searches for SNN architectures that outperform hand-crafted SNNs in accuracy and energy efficiency. 
We thoroughly demonstrate the effectiveness of AutoSNN on various datasets including neuromorphic datasets.", "bibtex": "@InProceedings{pmlr-v162-na22a,\n title = \t {{A}uto{SNN}: Towards Energy-Efficient Spiking Neural Networks},\n author = {Na, Byunggook and Mok, Jisoo and Park, Seongsik and Lee, Dongjin and Choe, Hyeokjun and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16253--16269},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/na22a/na22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/na22a.html},\n abstract = \t {Spiking neural networks (SNNs) that mimic information transmission in the brain can energy-efficiently process spatio-temporal information through discrete and sparse spikes, thereby receiving considerable attention. To improve accuracy and energy efficiency of SNNs, most previous studies have focused solely on training methods, and the effect of architecture has rarely been studied. We investigate the design choices used in the previous studies in terms of the accuracy and number of spikes and figure out that they are not best-suited for SNNs. To further improve the accuracy and reduce the spikes generated by SNNs, we propose a spike-aware neural architecture search framework called AutoSNN. We define a search space consisting of architectures without undesirable design choices. To enable the spike-aware architecture search, we introduce a fitness that considers both the accuracy and number of spikes. AutoSNN successfully searches for SNN architectures that outperform hand-crafted SNNs in accuracy and energy efficiency. 
We thoroughly demonstrate the effectiveness of AutoSNN on various datasets including neuromorphic datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/na22a/na22a.pdf", "supp": "", "pdf_size": 2080857, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4509781886252984486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Samsung Advanced Institute of Technology, South Korea; Department of Electric and Computer Engineering, Seoul National University, South Korea; Korea Institute of Science and Technology, South Korea; Department of Electric and Computer Engineering, Seoul National University, South Korea; Department of Electric and Computer Engineering, Seoul National University, South Korea; Interdisciplinary Program in Artificial Intelligence, Seoul National University, South Korea", "aff_domain": "snu.ac.kr; ; ; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ; ; ;snu.ac.kr", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/na22a.html", "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "Samsung;Seoul National University;Korea Institute of Science and Technology", "aff_unique_dep": "Samsung Advanced Institute of Technology;Department of Electric and Computer Engineering;", "aff_unique_url": "https://www.sait.samsung.com;https://www.snu.ac.kr;https://www.kist.re.kr", "aff_unique_abbr": "SAIT;SNU;KIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Auxiliary Learning with Joint Task and Data Scheduling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17291", "id": "17291", "proceeding": "https://proceedings.mlr.press/v162/chen22y.html", "poster": "/media/PosterPDFs/ICML%202022/7f1171a78ce0780a2142a6eb7bc4f3c8.png?t=1657195769.9752038", "slides": "", "author_site": "Hong Chen, Xin Wang, Chaoyu Guan, Yue Liu, Wenwu Zhu", "author": "Hong Chen; Xin Wang; Chaoyu Guan; Yue Liu; Wenwu Zhu", "abstract": "Existing auxiliary learning approaches only consider the relationships between the target task and the auxiliary tasks, ignoring the fact that data samples within an auxiliary task could contribute differently to the target task, which results in inefficient auxiliary information usage and non-robustness to data noise. In this paper, we propose to learn a joint task and data schedule for auxiliary learning, which captures the importance of different data samples in each auxiliary task to the target task. However, learning such a joint schedule is challenging due to the large number of additional parameters required for the schedule. To tackle the challenge, we propose a joint task and data scheduling (JTDS) model for auxiliary learning. The JTDS model captures the joint task-data importance through a task-data scheduler, which creates a mapping from task, feature and label information to the schedule in a parameter-efficient way. Particularly, we formulate the scheduler and the task learning process as a bi-level optimization problem. In the lower optimization, the task learning model is updated with the scheduled gradient, while in the upper optimization, the task-data scheduler is updated with the implicit gradient. 
Experimental results show that our JTDS model significantly outperforms the state-of-the-art methods under supervised, semi-supervised and corrupted label settings.", "bibtex": "@InProceedings{pmlr-v162-chen22y,\n title = \t {Auxiliary Learning with Joint Task and Data Scheduling},\n author = {Chen, Hong and Wang, Xin and Guan, Chaoyu and Liu, Yue and Zhu, Wenwu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3634--3647},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22y/chen22y.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22y.html},\n abstract = \t {Existing auxiliary learning approaches only consider the relationships between the target task and the auxiliary tasks, ignoring the fact that data samples within an auxiliary task could contribute differently to the target task, which results in inefficient auxiliary information usage and non-robustness to data noise. In this paper, we propose to learn a joint task and data schedule for auxiliary learning, which captures the importance of different data samples in each auxiliary task to the target task. However, learning such a joint schedule is challenging due to the large number of additional parameters required for the schedule. To tackle the challenge, we propose a joint task and data scheduling (JTDS) model for auxiliary learning. The JTDS model captures the joint task-data importance through a task-data scheduler, which creates a mapping from task, feature and label information to the schedule in a parameter-efficient way. Particularly, we formulate the scheduler and the task learning process as a bi-level optimization problem. In the lower optimization, the task learning model is updated with the scheduled gradient, while in the upper optimization, the task-data scheduler is updated with the implicit gradient. 
Experimental results show that our JTDS model significantly outperforms the state-of-the-art methods under supervised, semi-supervised and corrupted label settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22y/chen22y.pdf", "supp": "", "pdf_size": 4889319, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12031181081607035754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Technology, Tsinghua University; Department of Computer Science and Technology, Tsinghua University + THU-Bosch JCML center, Tsinghua University; Department of Computer Science and Technology, Tsinghua University; Department of Computer Science and Technology, Tsinghua University; Department of Computer Science and Technology, Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "github": "https://github.com/forchchch/JTDS", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22y.html", "aff_unique_index": "0;0+0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0;0", "aff_country_unique": "China" }, { "title": "BAMDT: Bayesian Additive Semi-Multivariate Decision Trees for Nonparametric Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16755", "id": "16755", "proceeding": "https://proceedings.mlr.press/v162/luo22a.html", "poster": "/media/PosterPDFs/ICML%202022/1f9f9d8ff75205aa73ec83e543d8b571.png?t=1657166977.3969903", "slides": "", "author_site": "Zhao Tang Luo, Huiyan Sang, Bani Mallick", "author": "Zhao Tang Luo; Huiyan Sang; Bani Mallick", "abstract": "Bayesian additive regression trees (BART; Chipman et al., 2010) have gained great popularity as a flexible nonparametric function estimation and modeling tool. Nearly all existing BART models rely on decision tree weak learners with axis-parallel univariate split rules to partition the Euclidean feature space into rectangular regions. In practice, however, many regression problems involve features with multivariate structures (e.g., spatial locations) possibly lying in a manifold, where rectangular partitions may fail to respect irregular intrinsic geometry and boundary constraints of the structured feature space. In this paper, we develop a new class of Bayesian additive multivariate decision tree models that combine univariate split rules for handling possibly high dimensional features without known multivariate structures and novel multivariate split rules for features with multivariate structures in each weak learner. The proposed multivariate split rules are built upon stochastic predictive spanning tree bipartition models on reference knots, which are capable of achieving highly flexible nonlinear decision boundaries on manifold feature spaces while enabling efficient dimension reduction computations. 
We demonstrate the superior performance of the proposed method using simulation data and a Sacramento housing price data set.", "bibtex": "@InProceedings{pmlr-v162-luo22a,\n title = \t {{BAMDT}: {B}ayesian Additive Semi-Multivariate Decision Trees for Nonparametric Regression},\n author = {Luo, Zhao Tang and Sang, Huiyan and Mallick, Bani},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14509--14526},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/luo22a/luo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/luo22a.html},\n abstract = \t {Bayesian additive regression trees (BART; Chipman et al., 2010) have gained great popularity as a flexible nonparametric function estimation and modeling tool. Nearly all existing BART models rely on decision tree weak learners with axis-parallel univariate split rules to partition the Euclidean feature space into rectangular regions. In practice, however, many regression problems involve features with multivariate structures (e.g., spatial locations) possibly lying in a manifold, where rectangular partitions may fail to respect irregular intrinsic geometry and boundary constraints of the structured feature space. In this paper, we develop a new class of Bayesian additive multivariate decision tree models that combine univariate split rules for handling possibly high dimensional features without known multivariate structures and novel multivariate split rules for features with multivariate structures in each weak learner. The proposed multivariate split rules are built upon stochastic predictive spanning tree bipartition models on reference knots, which are capable of achieving highly flexible nonlinear decision boundaries on manifold feature spaces while enabling efficient dimension reduction computations. 
We demonstrate the superior performance of the proposed method using simulation data and a Sacramento housing price data set.}\n}", "pdf": "https://proceedings.mlr.press/v162/luo22a/luo22a.pdf", "supp": "", "pdf_size": 1891081, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14076635556650013414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Statistics, Texas A&M University, College Station, TX, USA; Department of Statistics, Texas A&M University, College Station, TX, USA; Department of Statistics, Texas A&M University, College Station, TX, USA", "aff_domain": "gmail.com;stat.tamu.edu; ", "email": "gmail.com;stat.tamu.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/luo22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Station", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16015", "id": "16015", "proceeding": "https://proceedings.mlr.press/v162/li22n.html", "poster": "/media/PosterPDFs/ICML%202022/f74909ace68e51891440e4da0b65a70c_9pWkDJi.png?t=1657252695.5541654", "slides": "", "author_site": "Junnan Li, DONGXU LI, Caiming Xiong, Steven Hoi", "author": "Junnan Li; Dongxu Li; Caiming Xiong; Steven Hoi", "abstract": "Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. 
Code and models are available at https://github.com/salesforce/BLIP.", "bibtex": "@InProceedings{pmlr-v162-li22n,\n title = \t {{BLIP}: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},\n author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12888--12900},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22n/li22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22n.html},\n abstract = \t {Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. 
Code and models are available at https://github.com/salesforce/BLIP.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22n/li22n.pdf", "supp": "", "pdf_size": 1235427, "gs_citation": 5196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7770442917120891581&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Salesforce Research; Salesforce Research; Salesforce Research; Salesforce Research", "aff_domain": "salesforce.com; ; ; ", "email": "salesforce.com; ; ; ", "github": "https://github.com/salesforce/BLIP", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/li22n.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Salesforce", "aff_unique_dep": "Salesforce Research", "aff_unique_url": "https://research.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "BabelTower: Learning to Auto-parallelized Program Translation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18237", "id": "18237", "proceeding": "https://proceedings.mlr.press/v162/wen22b.html", "poster": "/media/PosterPDFs/ICML%202022/e7b24b112a44fdd9ee93bdf998c6ca0e.png?t=1657207143.8653538", "slides": "", "author_site": "Yuanbo Wen, Qi Guo, Qiang Fu, XiaQing Li, jianxing xu, Yanlin Tang, Yongwei Zhao, Xing Hu, Zidong Du, Ling Li, Chao Wang, Xuehai Zhou, Yunji Chen", "author": "Yuanbo Wen; Qi Guo; Qiang Fu; Xiaqing Li; Jianxing Xu; Yanlin Tang; Yongwei Zhao; Xing Hu; Zidong Du; Ling Li; Chao Wang; Xuehai Zhou; Yunji Chen", "abstract": "GPUs have become the dominant computing platforms for many applications, while programming GPUs with the widely-used CUDA parallel programming model is difficult. As sequential C code is relatively easy to obtain either from legacy repositories or by manual implementation, automatically translating C to its parallel CUDA counterpart is promising to relieve the burden of GPU programming. However, because of huge differences between the sequential C and the parallel CUDA programming model, existing approaches fail to conduct the challenging auto-parallelized program translation. In this paper, we propose a learning-based framework, i.e., BabelTower, to address this problem. We first create a large-scale dataset consisting of compute-intensive function-level monolingual corpora. We further propose using back-translation with a discriminative reranker to cope with unpaired corpora and parallel semantic conversion. Experimental results show that BabelTower outperforms state-of-the-art by 1.79, 6.09, and 9.39 in terms of BLEU, CodeBLEU, and specifically designed ParaBLEU, respectively. 
The CUDA code generated by BabelTower attains a speedup of up to 347x over the sequential C code, and the developer productivity is improved by at most 3.8x.", "bibtex": "@InProceedings{pmlr-v162-wen22b,\n title = \t {{B}abel{T}ower: Learning to Auto-parallelized Program Translation},\n author = {Wen, Yuanbo and Guo, Qi and Fu, Qiang and Li, Xiaqing and Xu, Jianxing and Tang, Yanlin and Zhao, Yongwei and Hu, Xing and Du, Zidong and Li, Ling and Wang, Chao and Zhou, Xuehai and Chen, Yunji},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23685--23700},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wen22b/wen22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wen22b.html},\n abstract = \t {GPUs have become the dominant computing platforms for many applications, while programming GPUs with the widely-used CUDA parallel programming model is difficult. As sequential C code is relatively easy to obtain either from legacy repositories or by manual implementation, automatically translating C to its parallel CUDA counterpart is promising to relieve the burden of GPU programming. However, because of huge differences between the sequential C and the parallel CUDA programming model, existing approaches fail to conduct the challenging auto-parallelized program translation. In this paper, we propose a learning-based framework, i.e., BabelTower, to address this problem. We first create a large-scale dataset consisting of compute-intensive function-level monolingual corpora. We further propose using back-translation with a discriminative reranker to cope with unpaired corpora and parallel semantic conversion. Experimental results show that BabelTower outperforms state-of-the-art by 1.79, 6.09, and 9.39 in terms of BLEU, CodeBLEU, and specifically designed ParaBLEU, respectively. 
The CUDA code generated by BabelTower attains a speedup of up to 347x over the sequential C code, and the developer productivity is improved by at most 3.8x.}\n}", "pdf": "https://proceedings.mlr.press/v162/wen22b/wen22b.pdf", "supp": "", "pdf_size": 789986, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8790679224559684782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "University of Science and Technology of China+State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+University of Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; University of Science and Technology of China+State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences; University of Science and Technology of China+State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+University of Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+Cambricon Technologies, Beijing, China; Institute of Software, Chinese Academy of Sciences; University of Science and Technology of China; University of Science and Technology of China; State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences+University of Chinese Academy of Sciences+Cambricon Technologies, Beijing, China", "aff_domain": "ustc.edu.cn;ict.ac.cn;ustc.edu.cn;ict.ac.cn;ustc.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn;ustc.edu.cn;ustc.edu.cn;ict.ac.cn", "email": "ustc.edu.cn;ict.ac.cn;ustc.edu.cn;ict.ac.cn;ustc.edu.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;ict.ac.cn;iscas.ac.cn;ustc.edu.cn;ustc.edu.cn;ict.ac.cn", "github": "", "project": "", "author_num": 13, "oa": "https://proceedings.mlr.press/v162/wen22b.html", "aff_unique_index": "0+1+2;1+3+2;0+1+2;1;0+1+2;1+3+2;1+2;1;1+2;1;0;0;1+3+2", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences;Cambricon Technologies;University of Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Computing Technology;;", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ict.ac.cn;https://www.cambricon.com;http://www.ucas.ac.cn", "aff_unique_abbr": "USTC;CAS;;UCAS", "aff_campus_unique_index": "1;1;1;1;1;1;1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0;0+0+0;0+0+0;0+0;0;0+0;0;0;0;0+0+0", "aff_country_unique": "China" }, { "title": "Balancing Discriminability and Transferability for Source-Free Domain Adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17779", "id": "17779", "proceeding": "https://proceedings.mlr.press/v162/kundu22a.html", "poster": "/media/PosterPDFs/ICML%202022/6da37dd3139aa4d9aa55b8d237ec5d4a.png?t=1657687267.4272425", "slides": "", "author_site": "Jogendra Nath Kundu, 
Akshay Kulkarni, Suvaansh Bhambri, Deepesh Mehta, Shreyas Kulkarni, Varun Jampani, Venkatesh Babu Radhakrishnan", "author": "Jogendra Nath Kundu; Akshay R Kulkarni; Suvaansh Bhambri; Deepesh Mehta; Shreyas Anand Kulkarni; Varun Jampani; Venkatesh Babu Radhakrishnan", "abstract": "Conventional domain adaptation (DA) techniques aim to improve domain transferability by learning domain-invariant representations; while concurrently preserving the task-discriminability knowledge gathered from the labeled source data. However, the requirement of simultaneous access to labeled source and unlabeled target renders them unsuitable for the challenging source-free DA setting. The trivial solution of realizing an effective original to generic domain mapping improves transferability but degrades task discriminability. Upon analyzing the hurdles from both theoretical and empirical standpoints, we derive novel insights to show that a mixup between original and corresponding translated generic samples enhances the discriminability-transferability trade-off while duly respecting the privacy-oriented source-free setting. A simple but effective realization of the proposed insights on top of the existing source-free DA approaches yields state-of-the-art performance with faster convergence. Beyond single-source, we also outperform multi-source prior-arts across both classification and semantic segmentation benchmarks.", "bibtex": "@InProceedings{pmlr-v162-kundu22a,\n title = \t {Balancing Discriminability and Transferability for Source-Free Domain Adaptation},\n author = {Kundu, Jogendra Nath and Kulkarni, Akshay R and Bhambri, Suvaansh and Mehta, Deepesh and Kulkarni, Shreyas Anand and Jampani, Varun and Radhakrishnan, Venkatesh Babu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11710--11728},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kundu22a/kundu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kundu22a.html},\n abstract = \t {Conventional domain adaptation (DA) techniques aim to improve domain transferability by learning domain-invariant representations; while concurrently preserving the task-discriminability knowledge gathered from the labeled source data. However, the requirement of simultaneous access to labeled source and unlabeled target renders them unsuitable for the challenging source-free DA setting. The trivial solution of realizing an effective original to generic domain mapping improves transferability but degrades task discriminability. Upon analyzing the hurdles from both theoretical and empirical standpoints, we derive novel insights to show that a mixup between original and corresponding translated generic samples enhances the discriminability-transferability trade-off while duly respecting the privacy-oriented source-free setting. A simple but effective realization of the proposed insights on top of the existing source-free DA approaches yields state-of-the-art performance with faster convergence. 
Beyond single-source, we also outperform multi-source prior-arts across both classification and semantic segmentation benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/kundu22a/kundu22a.pdf", "supp": "", "pdf_size": 3040425, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9320809919166954591&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Indian Institute of Science; Indian Institute of Science; Indian Institute of Science; Indian Institute of Science; Indian Institute of Science; Google Research; Indian Institute of Science", "aff_domain": "iisc.ac.in; ; ; ; ; ; ", "email": "iisc.ac.in; ; ; ; ; ; ", "github": "", "project": "https://sites.google.com/view/mixup-sfda", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/kundu22a.html", "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Indian Institute of Science;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.iisc.ac.in;https://research.google", "aff_unique_abbr": "IISc;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "India;United States" }, { "title": "Balancing Sample Efficiency and Suboptimality in Inverse Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16361", "id": "16361", "proceeding": "https://proceedings.mlr.press/v162/damiani22a.html", "poster": "/media/PosterPDFs/ICML%202022/9a6a1aaafe73c572b7374828b03a1881.png?t=1657206052.6200264", "slides": "", "author_site": "Angelo Damiani, Giorgio Manganini, Alberto Maria Metelli, Marcello Restelli", "author": "Angelo Damiani; Giorgio Manganini; Alberto Maria Metelli; Marcello Restelli", "abstract": "We propose a novel formulation for the Inverse Reinforcement Learning (IRL) problem, which jointly accounts for the compatibility with the expert behavior of the identified reward and its effectiveness for the subsequent forward learning phase. Albeit quite natural, especially when the final goal is apprenticeship learning (learning policies from an expert), this aspect has been completely overlooked by IRL approaches so far. We propose a new model-free IRL method that is remarkably able to autonomously find a trade-off between the error induced on the learned policy when potentially choosing a sub-optimal reward, and the estimation error caused by using finite samples in the forward learning phase, which can be controlled by explicitly optimizing also the discount factor of the related learning problem. The approach is based on a min-max formulation for the robust selection of the reward parameters and the discount factor so that the distance between the expert\u2019s policy and the learned policy is minimized in the successive forward learning task when a finite and possibly small number of samples is available. Differently from the majority of other IRL techniques, our approach does not involve any planning or forward Reinforcement Learning problems to be solved. 
After presenting the formulation, we provide a numerical scheme for the optimization, and we show its effectiveness on an illustrative numerical case.", "bibtex": "@InProceedings{pmlr-v162-damiani22a,\n title = \t {Balancing Sample Efficiency and Suboptimality in Inverse Reinforcement Learning},\n author = {Damiani, Angelo and Manganini, Giorgio and Metelli, Alberto Maria and Restelli, Marcello},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4618--4629},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/damiani22a/damiani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/damiani22a.html},\n abstract = \t {We propose a novel formulation for the Inverse Reinforcement Learning (IRL) problem, which jointly accounts for the compatibility with the expert behavior of the identified reward and its effectiveness for the subsequent forward learning phase. Albeit quite natural, especially when the final goal is apprenticeship learning (learning policies from an expert), this aspect has been completely overlooked by IRL approaches so far. We propose a new model-free IRL method that is remarkably able to autonomously find a trade-off between the error induced on the learned policy when potentially choosing a sub-optimal reward, and the estimation error caused by using finite samples in the forward learning phase, which can be controlled by explicitly optimizing also the discount factor of the related learning problem. The approach is based on a min-max formulation for the robust selection of the reward parameters and the discount factor so that the distance between the expert\u2019s policy and the learned policy is minimized in the successive forward learning task when a finite and possibly small number of samples is available. Differently from the majority of other IRL techniques, our approach does not involve any planning or forward Reinforcement Learning problems to be solved. 
After presenting the formulation, we provide a numerical scheme for the optimization, and we show its effectiveness on an illustrative numerical case.}\n}", "pdf": "https://proceedings.mlr.press/v162/damiani22a/damiani22a.pdf", "supp": "", "pdf_size": 1326996, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12296934240855804327&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Gran Sasso Science Institute, L\u2019Aquila, Italy+Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milano, Italy; Department of Computer Science, Gran Sasso Science Institute, L\u2019Aquila, Italy+Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milano, Italy; Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milano, Italy; Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milano, Italy", "aff_domain": "gssi.it;gssi.it;polimi.it;polimi.it", "email": "gssi.it;gssi.it;polimi.it;polimi.it", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/damiani22a.html", "aff_unique_index": "0+1;0+1;1;1", "aff_unique_norm": "Gran Sasso Science Institute;Politecnico di Milano", "aff_unique_dep": "Department of Computer Science;Dipartimento di Elettronica, Informazione e Bioingegneria", "aff_unique_url": "https://www.gssi.it;https://www.polimi.it", "aff_unique_abbr": ";Politecnico di Milano", "aff_campus_unique_index": "0+1;0+1;1;1", "aff_campus_unique": "L\u2019Aquila;Milano", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "Italy" }, { "title": "Batch Greenkhorn Algorithm for Entropic-Regularized Multimarginal Optimal Transport: Linear Rate of Convergence and Iteration Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17939", "id": "17939", "proceeding": "https://proceedings.mlr.press/v162/kostic22a.html", "poster": "/media/PosterPDFs/ICML%202022/bc6d753857fe3dd4275dff707dedf329.png?t=1656486798.865261", "slides": "/media/icml-2022/Slides/17939_2CW6ltx.pdf", "author_site": "Vladimir Kostic, Saverio Salzo, Massimiliano Pontil", "author": "Vladimir R. Kostic; Saverio Salzo; Massimiliano Pontil", "abstract": "In this work we propose a batch multimarginal version of the Greenkhorn algorithm for the entropic-regularized optimal transport problem. This framework is general enough to cover, as particular cases, existing Sinkhorn and Greenkhorn algorithms for the bi-marginal setting, and greedy MultiSinkhorn for the general multimarginal case. We provide a comprehensive convergence analysis based on the properties of the iterative Bregman projections method with greedy control. Linear rate of convergence as well as explicit bounds on the iteration complexity are obtained. When specialized to the above mentioned algorithms, our results give new convergence rates or provide key improvements over the state-of-the-art rates. We present numerical experiments showing that the flexibility of the batch can be exploited to improve performance of Sinkhorn algorithm both in bi-marginal and multimarginal settings.", "bibtex": "@InProceedings{pmlr-v162-kostic22a,\n title = \t {Batch Greenkhorn Algorithm for Entropic-Regularized Multimarginal Optimal Transport: Linear Rate of Convergence and Iteration Complexity},\n author = {Kostic, Vladimir R. 
and Salzo, Saverio and Pontil, Massimiliano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11529--11558},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kostic22a/kostic22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kostic22a.html},\n abstract = \t {In this work we propose a batch multimarginal version of the Greenkhorn algorithm for the entropic-regularized optimal transport problem. This framework is general enough to cover, as particular cases, existing Sinkhorn and Greenkhorn algorithms for the bi-marginal setting, and greedy MultiSinkhorn for the general multimarginal case. We provide a comprehensive convergence analysis based on the properties of the iterative Bregman projections method with greedy control. Linear rate of convergence as well as explicit bounds on the iteration complexity are obtained. When specialized to the above mentioned algorithms, our results give new convergence rates or provide key improvements over the state-of-the-art rates. We present numerical experiments showing that the flexibility of the batch can be exploited to improve performance of Sinkhorn algorithm both in bi-marginal and multimarginal settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/kostic22a/kostic22a.pdf", "supp": "", "pdf_size": 2586344, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8452618603104081315&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Istituto Italiano di Tecnologia, Via Melen 83, 16152 Genova, Italy+Department of Mathematics and Informatics, Faculty of Science, University of Novi Sad, Trg Dositeja Obradovi\u0107a 4, 21000 Novi Sad, Serbia; Istituto Italiano di Tecnologia, Via Melen 83, 16152 Genova, Italy; Department of Computer Science, University College London, Gower Street WC1E 6BT, London, United Kingdom", "aff_domain": "iit.it;iit.it; ", "email": "iit.it;iit.it; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kostic22a.html", "aff_unique_index": "0+1;0;2", "aff_unique_norm": "Istituto Italiano di Tecnologia;University of Novi Sad;University College London", "aff_unique_dep": ";Department of Mathematics and Informatics;Department of Computer Science", "aff_unique_url": "https://www.iit.it;https://www.uns.ac.rs;https://www.ucl.ac.uk", "aff_unique_abbr": "IIT;UNS;UCL", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Novi Sad;London", "aff_country_unique_index": "0+1;0;2", "aff_country_unique": "Italy;Serbia;United Kingdom" }, { "title": "Batched Dueling Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18227", "id": "18227", "proceeding": "https://proceedings.mlr.press/v162/agarwal22a.html", "poster": "", "slides": "", "author_site": "Arpit Agarwal, Rohan Ghuge, viswanath nagarajan", "author": "Arpit Agarwal; Rohan Ghuge; Viswanath Nagarajan", "abstract": "The K-armed dueling bandit problem, where the feedback is in the form of noisy pairwise comparisons, has been widely studied. Previous works have only focused on the sequential setting where the policy adapts after every comparison. 
However, in many applications such as search ranking and recommendation systems, it is preferable to perform comparisons in a limited number of parallel batches. We study the batched K-armed dueling bandit problem under two standard settings: (i) existence of a Condorcet winner, and (ii) strong stochastic transitivity and stochastic triangle inequality. For both settings, we obtain algorithms with a smooth trade-off between the number of batches and regret. Our regret bounds match the best known sequential regret bounds (up to poly-logarithmic factors), using only a logarithmic number of batches. We complement our regret analysis with a nearly-matching lower bound. Finally, we also validate our theoretical results via experiments on synthetic and real data.", "bibtex": "@InProceedings{pmlr-v162-agarwal22a,\n title = \t {Batched Dueling Bandits},\n author = {Agarwal, Arpit and Ghuge, Rohan and Nagarajan, Viswanath},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {89--110},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/agarwal22a/agarwal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/agarwal22a.html},\n abstract = \t {The K-armed dueling bandit problem, where the feedback is in the form of noisy pairwise comparisons, has been widely studied. Previous works have only focused on the sequential setting where the policy adapts after every comparison. However, in many applications such as search ranking and recommendation systems, it is preferable to perform comparisons in a limited number of parallel batches. We study the batched K-armed dueling bandit problem under two standard settings: (i) existence of a Condorcet winner, and (ii) strong stochastic transitivity and stochastic triangle inequality. For both settings, we obtain algorithms with a smooth trade-off between the number of batches and regret. Our regret bounds match the best known sequential regret bounds (up to poly-logarithmic factors), using only a logarithmic number of batches. We complement our regret analysis with a nearly-matching lower bound. 
Finally, we also validate our theoretical results via experiments on synthetic and real data.}\n}", "pdf": "https://proceedings.mlr.press/v162/agarwal22a/agarwal22a.pdf", "supp": "", "pdf_size": 530512, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1451930648124427550&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Data Science Institute, Columbia University, New York, USA; Department of Industrial and Operations Engineering, University of Michigan, Ann Arbor, USA; Department of Industrial and Operations Engineering, University of Michigan, Ann Arbor, USA", "aff_domain": "umich.edu; ; ", "email": "umich.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/agarwal22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Columbia University;University of Michigan", "aff_unique_dep": "Data Science Institute;Department of Industrial and Operations Engineering", "aff_unique_url": "https://www.columbia.edu;https://www.umich.edu", "aff_unique_abbr": "Columbia;UM", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "New York;Ann Arbor", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Continuous-Time Tucker Decomposition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16047", "id": "16047", "proceeding": "https://proceedings.mlr.press/v162/fang22b.html", "poster": "/media/PosterPDFs/ICML%202022/b4568df26077653eeadf29596708c94b_Cf0JjiU.png?t=1657212673.819191", "slides": "", "author_site": "Shikai Fang, Akil Narayan, Robert Kirby, Shandian Zhe", "author": "Shikai Fang; Akil Narayan; Robert Kirby; Shandian Zhe", "abstract": "Tensor decomposition is a dominant framework for multiway data analysis and prediction. Although practical data often contains timestamps for the observed entries, existing tensor decomposition approaches overlook or under-use this valuable time information. They either drop the timestamps or bin them into crude steps and hence ignore the temporal dynamics within each step or use simple parametric time coefficients. To overcome these limitations, we propose Bayesian Continuous-Time Tucker Decomposition. We model the tensor-core of the classical Tucker decomposition as a time-varying function, and place a Gaussian process prior to flexibly estimate all kinds of temporal dynamics. In this way, our model maintains the interpretability while being flexible enough to capture various complex temporal relationships between the tensor nodes. For efficient and high-quality posterior inference, we use the stochastic differential equation (SDE) representation of temporal GPs to build an equivalent state-space prior, which avoids huge kernel matrix computation and sparse/low-rank approximations. We then use Kalman filtering, RTS smoothing, and conditional moment matching to develop a scalable message passing inference algorithm. 
We show the advantage of our method in simulation and several real-world applications.", "bibtex": "@InProceedings{pmlr-v162-fang22b,\n title = \t {{B}ayesian Continuous-Time Tucker Decomposition},\n author = {Fang, Shikai and Narayan, Akil and Kirby, Robert and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6235--6245},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fang22b/fang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/fang22b.html},\n abstract = \t {Tensor decomposition is a dominant framework for multiway data analysis and prediction. Although practical data often contains timestamps for the observed entries, existing tensor decomposition approaches overlook or under-use this valuable time information. They either drop the timestamps or bin them into crude steps and hence ignore the temporal dynamics within each step or use simple parametric time coefficients. To overcome these limitations, we propose Bayesian Continuous-Time Tucker Decomposition. We model the tensor-core of the classical Tucker decomposition as a time-varying function, and place a Gaussian process prior to flexibly estimate all kinds of temporal dynamics. In this way, our model maintains the interpretability while is flexible enough to capture various complex temporal relationships between the tensor nodes. For efficient and high-quality posterior inference, we use the stochastic differential equation (SDE) representation of temporal GPs to build an equivalent state-space prior, which avoids huge kernel matrix computation and sparse/low-rank approximations. We then use Kalman filtering, RTS smoothing, and conditional moment matching to develop a scalable message passing inference algorithm. 
We show the advantage of our method in simulation and several real-world applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/fang22b/fang22b.pdf", "supp": "", "pdf_size": 641761, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9185267639661273634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Computing, University of Utah+Scientific Computing and Imaging (SCI) Institute, University of Utah; Scientific Computing and Imaging (SCI) Institute, University of Utah+Department of Mathematics, University of Utah; School of Computing, University of Utah+Scientific Computing and Imaging (SCI) Institute, University of Utah; School of Computing, University of Utah", "aff_domain": "cs.utah.edu;sci.utah.edu;cs.utah.edu;cs.utah.edu", "email": "cs.utah.edu;sci.utah.edu;cs.utah.edu;cs.utah.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/fang22b.html", "aff_unique_index": "0+0;0+0;0+0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "U of U", "aff_campus_unique_index": "0+1;1;0+1;0", "aff_campus_unique": "Utah;Salt Lake City;", "aff_country_unique_index": "0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Deep Embedding Topic Meta-Learner", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18297", "id": "18297", "proceeding": "https://proceedings.mlr.press/v162/duan22d.html", "poster": "", "slides": "", "author_site": "Zhibin Duan, Yishi Xu, Jianqiao Sun, Bo Chen, Wenchao Chen, CHAOJIE WANG, Mingyuan Zhou", "author": "Zhibin Duan; Yishi Xu; Jianqiao Sun; Bo Chen; Wenchao Chen; Chaojie Wang; Mingyuan Zhou", "abstract": "Existing deep topic models are effective in capturing the latent semantic structures in textual data but usually rely on a plethora of documents. This is less than satisfactory in practical applications when only a limited amount of data is available. In this paper, we propose a novel framework that efficiently solves the problem of topic modeling under the small data regime. Specifically, the framework involves two innovations: a bi-level generative model that aims to exploit the task information to guide the document generation, and a topic meta-learner that strives to learn a group of global topic embeddings so that fast adaptation to the task-specific topic embeddings can be achieved with a few examples. 
We apply the proposed framework to a hierarchical embedded topic model and achieve better performance than various baseline models on diverse experiments, including few-shot topic discovery and few-shot document classification.", "bibtex": "@InProceedings{pmlr-v162-duan22d,\n title = \t {{B}ayesian Deep Embedding Topic Meta-Learner},\n author = {Duan, Zhibin and Xu, Yishi and Sun, Jianqiao and Chen, Bo and Chen, Wenchao and Wang, Chaojie and Zhou, Mingyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5659--5670},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/duan22d/duan22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/duan22d.html},\n abstract = \t {Existing deep topic models are effective in capturing the latent semantic structures in textual data but usually rely on a plethora of documents. This is less than satisfactory in practical applications when only a limited amount of data is available. In this paper, we propose a novel framework that efficiently solves the problem of topic modeling under the small data regime. Specifically, the framework involves two innovations: a bi-level generative model that aims to exploit the task information to guide the document generation, and a topic meta-learner that strives to learn a group of global topic embeddings so that fast adaptation to the task-specific topic embeddings can be achieved with a few examples. We apply the proposed framework to a hierarchical embedded topic model and achieve better performance than various baseline models on diverse experiments, including few-shot topic discovery and few-shot document classification.}\n}", "pdf": "https://proceedings.mlr.press/v162/duan22d/duan22d.pdf", "supp": "", "pdf_size": 830234, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7389043230150399813&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; McCombs School of Business, The University of Texas at Austin, Austin, TX 78712, USA", "aff_domain": "mail.xidian.edu.cn; ; ; ; ; ; ", "email": "mail.xidian.edu.cn; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/duan22d.html", "aff_unique_index": "0;0;0;0;0;0;1", "aff_unique_norm": "Xidian University;University of Texas at Austin", "aff_unique_dep": "National Laboratory of Radar Signal Processing;McCombs School of Business", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.mccombs.utexas.edu", "aff_unique_abbr": "Xidian;UT Austin", "aff_campus_unique_index": "0;0;0;0;0;0;1", "aff_campus_unique": "Xi'an;Austin", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Bayesian Imitation Learning for 
End-to-End Mobile Manipulation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16663", "id": "16663", "proceeding": "https://proceedings.mlr.press/v162/du22b.html", "poster": "/media/PosterPDFs/ICML%202022/cdf28f8b7d14ab02d12a2329d71e4079.png?t=1656687625.846664", "slides": "", "author_site": "Yuqing Du, Daniel Ho, Alexander Alemi, Eric Jang, Mohi Khansari", "author": "Yuqing Du; Daniel Ho; Alex Alemi; Eric Jang; Mohi Khansari", "abstract": "In this work we investigate and demonstrate benefits of a Bayesian approach to imitation learning from multiple sensor inputs, as applied to the task of opening office doors with a mobile manipulator. Augmenting policies with additional sensor inputs{\u2014}such as RGB + depth cameras{\u2014}is a straightforward approach to improving robot perception capabilities, especially for tasks that may favor different sensors in different situations. As we scale multi-sensor robotic learning to unstructured real-world settings (e.g. offices, homes) and more complex robot behaviors, we also increase reliance on simulators for cost, efficiency, and safety. Consequently, the sim-to-real gap across multiple sensor modalities also increases, making simulated validation more difficult. We show that using the Variational Information Bottleneck (Alemi et al., 2016) to regularize convolutional neural networks improves generalization to heldout domains and reduces the sim-to-real gap in a sensor-agnostic manner. As a side effect, the learned embeddings also provide useful estimates of model uncertainty for each sensor. We demonstrate that our method is able to help close the sim-to-real gap and successfully fuse RGB and depth modalities based on understanding of the situational uncertainty of each sensor. In a real-world office environment, we achieve 96% task success, improving upon the baseline by +16%.", "bibtex": "@InProceedings{pmlr-v162-du22b,\n title = \t {{B}ayesian Imitation Learning for End-to-End Mobile Manipulation},\n author = {Du, Yuqing and Ho, Daniel and Alemi, Alex and Jang, Eric and Khansari, Mohi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5531--5546},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/du22b/du22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/du22b.html},\n abstract = \t {In this work we investigate and demonstrate benefits of a Bayesian approach to imitation learning from multiple sensor inputs, as applied to the task of opening office doors with a mobile manipulator. Augmenting policies with additional sensor inputs{\u2014}such as RGB + depth cameras{\u2014}is a straightforward approach to improving robot perception capabilities, especially for tasks that may favor different sensors in different situations. As we scale multi-sensor robotic learning to unstructured real-world settings (e.g. offices, homes) and more complex robot behaviors, we also increase reliance on simulators for cost, efficiency, and safety. Consequently, the sim-to-real gap across multiple sensor modalities also increases, making simulated validation more difficult. 
We show that using the Variational Information Bottleneck (Alemi et al., 2016) to regularize convolutional neural networks improves generalization to heldout domains and reduces the sim-to-real gap in a sensor-agnostic manner. As a side effect, the learned embeddings also provide useful estimates of model uncertainty for each sensor. We demonstrate that our method is able to help close the sim-to-real gap and successfully fuse RGB and depth modalities based on understanding of the situational uncertainty of each sensor. In a real-world office environment, we achieve 96% task success, improving upon the baseline by +16%.}\n}", "pdf": "https://proceedings.mlr.press/v162/du22b/du22b.pdf", "supp": "", "pdf_size": 5377298, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16244909318168426513&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "UC Berkeley + Everyday Robots; Everyday Robots; Google Research; Google; Everyday Robots", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/du22b.html", "aff_unique_index": "0+1;1;2;2;1", "aff_unique_norm": "University of California, Berkeley;Everyday Robots;Google", "aff_unique_dep": ";;Google Research", "aff_unique_url": "https://www.berkeley.edu;https://www.everydayrobots.com;https://research.google", "aff_unique_abbr": "UC Berkeley;;Google Research", "aff_campus_unique_index": "0;2;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Learning with Information Gain Provably Bounds Risk for a Robust Adversarial Defense", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18397", "id": "18397", "proceeding": "https://proceedings.mlr.press/v162/doan22a.html", "poster": "/media/PosterPDFs/ICML%202022/b9f94c77652c9a76fc8a442748cd54bd_N0qdRle.png?t=1657866160.532523", "slides": "", "author_site": "Bao Gia Doan, Ehsan Abbasnejad, Javen Qinfeng Shi, Damith Ranashinghe", "author": "Bao Gia Doan; Ehsan M. Abbasnejad; Javen Qinfeng Shi; Damith C. Ranasinghe", "abstract": "We present a new algorithm to learn a deep neural network model robust against adversarial attacks. Previous algorithms demonstrate an adversarially trained Bayesian Neural Network (BNN) provides improved robustness. We recognize the learning approach for approximating the multi-modal posterior distribution of an adversarially trained Bayesian model can lead to mode collapse; consequently, the model\u2019s achievements in robustness and performance are sub-optimal. Instead, we first propose preventing mode collapse to better approximate the multi-modal posterior distribution. Second, based on the intuition that a robust model should ignore perturbations and only consider the informative content of the input, we conceptualize and formulate an information gain objective to measure and force the information learned from both benign and adversarial training instances to be similar. Importantly, we prove and demonstrate that minimizing the information gain objective allows the adversarial risk to approach the conventional empirical risk. We believe our efforts provide a step towards a basis for a principled method of adversarially training BNNs. 
Our extensive experimental results demonstrate significantly improved robustness up to 20% compared with adversarial training and Adv-BNN under PGD attacks with 0.035 distortion on both CIFAR-10 and STL-10 dataset.", "bibtex": "@InProceedings{pmlr-v162-doan22a,\n title = \t {{B}ayesian Learning with Information Gain Provably Bounds Risk for a Robust Adversarial Defense},\n author = {Doan, Bao Gia and Abbasnejad, Ehsan M. and Shi, Javen Qinfeng and Ranasinghe C., Damith},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5309--5323},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/doan22a/doan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/doan22a.html},\n abstract = \t {We present a new algorithm to learn a deep neural network model robust against adversarial attacks. Previous algorithms demonstrate an adversarially trained Bayesian Neural Network (BNN) provides improved robustness. We recognize the learning approach for approximating the multi-modal posterior distribution of an adversarially trained Bayesian model can lead to mode collapse; consequently, the model\u2019s achievements in robustness and performance are sub-optimal. Instead, we first propose preventing mode collapse to better approximate the multi-modal posterior distribution. Second, based on the intuition that a robust model should ignore perturbations and only consider the informative content of the input, we conceptualize and formulate an information gain objective to measure and force the information learned from both benign and adversarial training instances to be similar. Importantly. we prove and demonstrate that minimizing the information gain objective allows the adversarial risk to approach the conventional empirical risk. We believe our efforts provide a step towards a basis for a principled method of adversarially training BNNs. 
Our extensive experimental results demonstrate significantly improved robustness up to 20% compared with adversarial training and Adv-BNN under PGD attacks with 0.035 distortion on both CIFAR-10 and STL-10 datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/doan22a/doan22a.pdf", "supp": "", "pdf_size": 1220628, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3769346561128501605&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Computer Science, University of Adelaide, SA, Australia; School of Computer Science, University of Adelaide, SA, Australia; School of Computer Science, University of Adelaide, SA, Australia; School of Computer Science, University of Adelaide, SA, Australia", "aff_domain": "adelaide.edu.au; ; ; ", "email": "adelaide.edu.au; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/doan22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Adelaide", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.adelaide.edu.au", "aff_unique_abbr": "Adelaide", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Adelaide", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Bayesian Model Selection, the Marginal Likelihood, and Generalization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17991", "id": "17991", "proceeding": "https://proceedings.mlr.press/v162/lotfi22a.html", "poster": "/media/PosterPDFs/ICML%202022/07b6f0c87d1dc9f9ab8e8543b60a419a.png?t=1658322881.4565783", "slides": "", "author_site": "Sanae Lotfi, Pavel Izmailov, Gregory Benton, Micah Goldblum, Andrew Wilson", "author": "Sanae Lotfi; Pavel Izmailov; Gregory Benton; Micah Goldblum; Andrew Gordon Wilson", "abstract": "How do we compare between hypotheses that are entirely consistent with observations? The marginal likelihood (aka Bayesian evidence), which represents the probability of generating our observations from a prior, provides a distinctive approach to this foundational question, automatically encoding Occam\u2019s razor. Although it has been observed that the marginal likelihood can overfit and is sensitive to prior assumptions, its limitations for hyperparameter learning and discrete model comparison have not been thoroughly investigated. We first revisit the appealing properties of the marginal likelihood for learning constraints and hypothesis testing. We then highlight the conceptual and practical issues in using the marginal likelihood as a proxy for generalization. Namely, we show how marginal likelihood can be negatively correlated with generalization, with implications for neural architecture search, and can lead to both underfitting and overfitting in hyperparameter learning. 
We provide a partial remedy through a conditional marginal likelihood, which we show is more aligned with generalization, and practically valuable for large-scale hyperparameter learning, such as in deep kernel learning.", "bibtex": "@InProceedings{pmlr-v162-lotfi22a,\n title = \t {{B}ayesian Model Selection, the Marginal Likelihood, and Generalization},\n author = {Lotfi, Sanae and Izmailov, Pavel and Benton, Gregory and Goldblum, Micah and Wilson, Andrew Gordon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14223--14247},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lotfi22a/lotfi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lotfi22a.html},\n abstract = \t {How do we compare between hypotheses that are entirely consistent with observations? The marginal likelihood (aka Bayesian evidence), which represents the probability of generating our observations from a prior, provides a distinctive approach to this foundational question, automatically encoding Occam\u2019s razor. Although it has been observed that the marginal likelihood can overfit and is sensitive to prior assumptions, its limitations for hyperparameter learning and discrete model comparison have not been thoroughly investigated. We first revisit the appealing properties of the marginal likelihood for learning constraints and hypothesis testing. We then highlight the conceptual and practical issues in using the marginal likelihood as a proxy for generalization. Namely, we show how marginal likelihood can be negatively correlated with generalization, with implications for neural architecture search, and can lead to both underfitting and overfitting in hyperparameter learning. 
We provide a partial remedy through a conditional marginal likelihood, which we show is more aligned with generalization, and practically valuable for large-scale hyperparameter learning, such as in deep kernel learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/lotfi22a/lotfi22a.pdf", "supp": "", "pdf_size": 1303492, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9966221610854779885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "New York University; New York University; New York University; New York University; New York University", "aff_domain": "nyu.edu; ; ; ;cims.nyu.edu", "email": "nyu.edu; ; ; ;cims.nyu.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lotfi22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Nonparametric Learning for Point Processes with Spatial Homogeneity: A Spatial Analysis of NBA Shot Locations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15957", "id": "15957", "proceeding": "https://proceedings.mlr.press/v162/yin22a.html", "poster": "/media/PosterPDFs/ICML%202022/a5e00132373a7031000fd987a3c9f87b.png?t=1657829708.2033858", "slides": "/media/icml-2022/Slides/15957_umEIaLy.pdf", "author_site": "Fan Yin, Jieying Jiao, Jun Yan, Guanyu Hu", "author": "Fan Yin; Jieying Jiao; Jun Yan; Guanyu Hu", "abstract": "Basketball shot location data provide valuable summary information regarding players to coaches, sports analysts, fans, statisticians, as well as players themselves. Represented by spatial points, such data are naturally analyzed with spatial point process models. We present a novel nonparametric Bayesian method for learning the underlying intensity surface built upon a combination of Dirichlet process and Markov random field. Our method has the advantage of effectively encouraging local spatial homogeneity when estimating a globally heterogeneous intensity surface. Posterior inferences are performed with an efficient Markov chain Monte Carlo (MCMC) algorithm. Simulation studies show that the inferences are accurate and the method is superior compared to a wide range of competing methods. Application to the shot location data of $20$ representative NBA players in the 2017-2018 regular season offers interesting insights about the shooting patterns of these players. 
A comparison against the competing method shows that the proposed method can effectively incorporate spatial contiguity into the estimation of intensity surfaces.", "bibtex": "@InProceedings{pmlr-v162-yin22a,\n title = \t {{B}ayesian Nonparametric Learning for Point Processes with Spatial Homogeneity: A Spatial Analysis of {NBA} Shot Locations},\n author = {Yin, Fan and Jiao, Jieying and Yan, Jun and Hu, Guanyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25523--25551},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yin22a/yin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yin22a.html},\n abstract = \t {Basketball shot location data provide valuable summary information regarding players to coaches, sports analysts, fans, statisticians, as well as players themselves. Represented by spatial points, such data are naturally analyzed with spatial point process models. We present a novel nonparametric Bayesian method for learning the underlying intensity surface built upon a combination of Dirichlet process and Markov random field. Our method has the advantage of effectively encouraging local spatial homogeneity when estimating a globally heterogeneous intensity surface. Posterior inferences are performed with an efficient Markov chain Monte Carlo (MCMC) algorithm. Simulation studies show that the inferences are accurate and the method is superior compared to a wide range of competing methods. Application to the shot location data of $20$ representative NBA players in the 2017-2018 regular season offers interesting insights about the shooting patterns of these players. 
A comparison against the competing method shows that the proposed method can effectively incorporate spatial contiguity into the estimation of intensity surfaces.}\n}", "pdf": "https://proceedings.mlr.press/v162/yin22a/yin22a.pdf", "supp": "", "pdf_size": 4819091, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7157348093952354556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, University of California, Irvine, CA, USA+Department of Statistics, University of Connecticut, Storrs, CT, USA; Department of Statistics, University of Connecticut, Storrs, CT, USA; Department of Statistics, University of Connecticut, Storrs, CT, USA; Department of Statistics, University of Missouri, Columbia, MO, USA", "aff_domain": "missouri.edu; ; ;missouri.edu", "email": "missouri.edu; ; ;missouri.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yin22a.html", "aff_unique_index": "0+1;1;1;2", "aff_unique_norm": "University of California, Irvine;University of Connecticut;University of Missouri", "aff_unique_dep": "Department of Statistics;Department of Statistics;Department of Statistics", "aff_unique_url": "https://www.uci.edu;https://www.uconn.edu;https://www.missouri.edu", "aff_unique_abbr": "UCI;UConn;MU", "aff_campus_unique_index": "0+1;1;1;2", "aff_campus_unique": "Irvine;Storrs;Columbia", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Nonparametrics for Offline Skill Discovery", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16269", "id": "16269", "proceeding": "https://proceedings.mlr.press/v162/villecroze22a.html", "poster": "/media/PosterPDFs/ICML%202022/e661551c8ec9308379cda7e2419348e5.png?t=1657639239.5547016", "slides": "", "author_site": "Valentin Villecroze, Harry Braviner, Panteha Naderian, Chris Maddison, Gabriel Loaiza-Ganem", "author": "Valentin Villecroze; Harry Braviner; Panteha Naderian; Chris Maddison; Gabriel Loaiza-Ganem", "abstract": "Skills or low-level policies in reinforcement learning are temporally extended actions that can speed up learning and enable complex behaviours. Recent work in offline reinforcement learning and imitation learning has proposed several techniques for skill discovery from a set of expert trajectories. While these methods are promising, the number K of skills to discover is always a fixed hyperparameter, which requires either prior knowledge about the environment or an additional parameter search to tune it. We first propose a method for offline learning of options (a particular skill framework) exploiting advances in variational inference and continuous relaxations. We then highlight an unexplored connection between Bayesian nonparametrics and offline skill discovery, and show how to obtain a nonparametric version of our model. This version is tractable thanks to a carefully structured approximate posterior with a dynamically-changing number of options, removing the need to specify K. 
We also show how our nonparametric extension can be applied in other skill frameworks, and empirically demonstrate that our method can outperform state-of-the-art offline skill learning algorithms across a variety of environments.", "bibtex": "@InProceedings{pmlr-v162-villecroze22a,\n title = \t {{B}ayesian Nonparametrics for Offline Skill Discovery},\n author = {Villecroze, Valentin and Braviner, Harry and Naderian, Panteha and Maddison, Chris and Loaiza-Ganem, Gabriel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22284--22299},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/villecroze22a/villecroze22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/villecroze22a.html},\n abstract = \t {Skills or low-level policies in reinforcement learning are temporally extended actions that can speed up learning and enable complex behaviours. Recent work in offline reinforcement learning and imitation learning has proposed several techniques for skill discovery from a set of expert trajectories. While these methods are promising, the number K of skills to discover is always a fixed hyperparameter, which requires either prior knowledge about the environment or an additional parameter search to tune it. We first propose a method for offline learning of options (a particular skill framework) exploiting advances in variational inference and continuous relaxations. We then highlight an unexplored connection between Bayesian nonparametrics and offline skill discovery, and show how to obtain a nonparametric version of our model. This version is tractable thanks to a carefully structured approximate posterior with a dynamically-changing number of options, removing the need to specify K. 
We also show how our nonparametric extension can be applied in other skill frameworks, and empirically demonstrate that our method can outperform state-of-the-art offline skill learning algorithms across a variety of environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/villecroze22a/villecroze22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/villecroze22a-supp.zip", "pdf_size": 1093109, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5074347961003664860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Layer 6 AI, Toronto, Canada+University of Toronto, Toronto, Canada; Layer 6 AI, Toronto, Canada; Layer 6 AI, Toronto, Canada; University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Layer 6 AI, Toronto, Canada", "aff_domain": "layer6.ai;layer6.ai;layer6.ai;cs.toronto.edu;layer6.ai", "email": "layer6.ai;layer6.ai;layer6.ai;cs.toronto.edu;layer6.ai", "github": "https://github.com/layer6ai-labs/BNPO", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/villecroze22a.html", "aff_unique_index": "0+1;0;0;1+2;0", "aff_unique_norm": "Layer 6 AI;University of Toronto;Vector Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://layer6.ai;https://www.utoronto.ca;https://vectorinstitute.ai", "aff_unique_abbr": ";U of T;Vector Institute", "aff_campus_unique_index": "0+0;0;0;0+0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0+0;0;0;0+0;0", "aff_country_unique": "Canada" }, { "title": "Bayesian Optimization for Distributionally Robust Chance-constrained Problem", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17799", "id": "17799", "proceeding": "https://proceedings.mlr.press/v162/inatsu22a.html", "poster": "/media/PosterPDFs/ICML%202022/89d9c467c2926de8ef12d1f3e006d06b.png?t=1657785684.0649629", "slides": "", "author_site": "Yu Inatsu, Shion Takeno, Masayuki Karasuyama, Ichiro Takeuchi", "author": "Yu Inatsu; Shion Takeno; Masayuki Karasuyama; Ichiro Takeuchi", "abstract": "In black-box function optimization, we need to consider not only controllable design variables but also uncontrollable stochastic environment variables. In such cases, it is necessary to solve the optimization problem by taking into account the uncertainty of the environmental variables. The chance-constrained (CC) problem, the problem of maximizing the expected value under a certain level of constraint satisfaction probability, is one of the practically important problems in the presence of environmental variables. In this study, we consider the distributionally robust CC (DRCC) problem and propose a novel DRCC Bayesian optimization method for the case where the distribution of the environmental variables cannot be precisely specified. 
We show that the proposed method can find an arbitrarily accurate solution with high probability in a finite number of trials, and confirm the usefulness of the proposed method through numerical experiments.", "bibtex": "@InProceedings{pmlr-v162-inatsu22a,\n title = \t {{B}ayesian Optimization for Distributionally Robust Chance-constrained Problem},\n author = {Inatsu, Yu and Takeno, Shion and Karasuyama, Masayuki and Takeuchi, Ichiro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9602--9621},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/inatsu22a/inatsu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/inatsu22a.html},\n abstract = \t {In black-box function optimization, we need to consider not only controllable design variables but also uncontrollable stochastic environment variables. In such cases, it is necessary to solve the optimization problem by taking into account the uncertainty of the environmental variables. The chance-constrained (CC) problem, the problem of maximizing the expected value under a certain level of constraint satisfaction probability, is one of the practically important problems in the presence of environmental variables. In this study, we consider the distributionally robust CC (DRCC) problem and propose a novel DRCC Bayesian optimization method for the case where the distribution of the environmental variables cannot be precisely specified. We show that the proposed method can find an arbitrarily accurate solution with high probability in a finite number of trials, and confirm the usefulness of the proposed method through numerical experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/inatsu22a/inatsu22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/inatsu22a-supp.zip", "pdf_size": 547736, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17253226262459339725&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan; Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan; Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan; Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan + RIKEN Center for Advanced Intelligence Project, Tokyo, Japan", "aff_domain": "nitech.ac.jp; ; ; ", "email": "nitech.ac.jp; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/inatsu22a.html", "aff_unique_index": "0;0;0;0+1", "aff_unique_norm": "Nagoya Institute of Technology;RIKEN Center for Advanced Intelligence Project", "aff_unique_dep": "Department of Computer Science;Center for Advanced Intelligence Project", "aff_unique_url": "https://www.nitech.ac.jp;https://www.riken.jp/en/c-aip/", "aff_unique_abbr": "NIT;RIKEN C-AIP", "aff_campus_unique_index": "0;0;0;0+1", "aff_campus_unique": "Nagoya;Tokyo", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "Japan" }, { "title": "Bayesian Optimization under Stochastic Delayed Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17789", "id": "17789", "proceeding": 
"https://proceedings.mlr.press/v162/verma22a.html", "poster": "/media/PosterPDFs/ICML%202022/54b2b21af94108d83c2a909d5b0a6a50.png?t=1657701350.668604", "slides": "", "author_site": "Arun Verma, Zhongxiang Dai, Bryan Kian Hsiang Low", "author": "Arun Verma; Zhongxiang Dai; Bryan Kian Hsiang Low", "abstract": "Bayesian optimization (BO) is a widely-used sequential method for zeroth-order optimization of complex and expensive-to-compute black-box functions. The existing BO methods assume that the function evaluation (feedback) is available to the learner immediately or after a fixed delay. Such assumptions may not be practical in many real-life problems like online recommendations, clinical trials, and hyperparameter tuning where feedback is available after a random delay. To benefit from the experimental parallelization in these problems, the learner needs to start new function evaluations without waiting for delayed feedback. In this paper, we consider the BO under stochastic delayed feedback problem. We propose algorithms with sub-linear regret guarantees that efficiently address the dilemma of selecting new function queries while waiting for randomly delayed feedback. Building on our results, we also make novel contributions to batch BO and contextual Gaussian process bandits. Experiments on synthetic and real-life datasets verify the performance of our algorithms.", "bibtex": "@InProceedings{pmlr-v162-verma22a,\n title = \t {{B}ayesian Optimization under Stochastic Delayed Feedback},\n author = {Verma, Arun and Dai, Zhongxiang and Low, Bryan Kian Hsiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22145--22167},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/verma22a/verma22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/verma22a.html},\n abstract = \t {Bayesian optimization (BO) is a widely-used sequential method for zeroth-order optimization of complex and expensive-to-compute black-box functions. The existing BO methods assume that the function evaluation (feedback) is available to the learner immediately or after a fixed delay. Such assumptions may not be practical in many real-life problems like online recommendations, clinical trials, and hyperparameter tuning where feedback is available after a random delay. To benefit from the experimental parallelization in these problems, the learner needs to start new function evaluations without waiting for delayed feedback. In this paper, we consider the BO under stochastic delayed feedback problem. We propose algorithms with sub-linear regret guarantees that efficiently address the dilemma of selecting new function queries while waiting for randomly delayed feedback. Building on our results, we also make novel contributions to batch BO and contextual Gaussian process bandits. 
Experiments on synthetic and real-life datasets verify the performance of our algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/verma22a/verma22a.pdf", "supp": "", "pdf_size": 768607, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2047978872599883537&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore", "aff_domain": "gmail.com;gmail.com; ", "email": "gmail.com;gmail.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/verma22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Be Like Water: Adaptive Floating Point for Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17287", "id": "17287", "proceeding": "https://proceedings.mlr.press/v162/yeh22a.html", "poster": "/media/PosterPDFs/ICML%202022/96ea64f3a1aa2fd00c72faacf0cb8ac9_aJtn0XT.png?t=1657694055.476772", "slides": "", "author_site": "Thomas Y. Yeh, Maxwell Sterner, Zerlina Lai, Brandon Chuang, Alexander Ihler", "author": "Thomas Yeh; Max Sterner; Zerlina Lai; Brandon Chuang; Alexander Ihler", "abstract": "In the pursuit of optimizing memory and compute density to accelerate machine learning applications, reduced precision training and inference has been an active area of research. While some approaches selectively apply low precision computations, this may require costly off-chip data transfers or mixed precision support. In this paper, we propose a novel numerical representation, Adaptive Floating Point (AFP), that dynamically adjusts to the characteristics of deep learning data. AFP requires no changes to the model topology, requires no additional training, and applies to all layers of DNN models. We evaluate AFP on a spectrum of representative models in computer vision and NLP, and show that our technique enables ultra-low precision inference of deep learning models while providing accuracy comparable to full precision inference. 
By dynamically adjusting to ML data, AFP increases memory density by 1.6x, 1.6x, and 3.2x and compute density by 4x, 1.3x, and 12x when compared to BFP, BFloat16, and FP32.", "bibtex": "@InProceedings{pmlr-v162-yeh22a,\n title = \t {Be Like Water: Adaptive Floating Point for Machine Learning},\n author = {Yeh, Thomas and Sterner, Max and Lai, Zerlina and Chuang, Brandon and Ihler, Alexander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25490--25500},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yeh22a/yeh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yeh22a.html},\n abstract = \t {In the pursuit of optimizing memory and compute density to accelerate machine learning applications, reduced precision training and inference has been an active area of research. While some approaches selectively apply low precision computations, this may require costly off-chip data transfers or mixed precision support. In this paper, we propose a novel numerical representation, Adaptive Floating Point (AFP), that dynamically adjusts to the characteristics of deep learning data. AFP requires no changes to the model topology, requires no additional training, and applies to all layers of DNN models. We evaluate AFP on a spectrum of representative models in computer vision and NLP, and show that our technique enables ultra-low precision inference of deep learning models while providing accuracy comparable to full precision inference. By dynamically adjusting to ML data, AFP increases memory density by 1.6x, 1.6x, and 3.2x and compute density by 4x, 1.3x, and 12x when compared to BFP, BFloat16, and FP32.}\n}", "pdf": "https://proceedings.mlr.press/v162/yeh22a/yeh22a.pdf", "supp": "", "pdf_size": 2117028, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10229675243359083740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Computer Science Department, Pomona College, Claremont, CA, USA; Computer Science Department, Pomona College, Claremont, CA, USA; Computer Science Department, Occidental College, Los Angeles, CA, USA; Computer Science Department, University of California, Santa Cruz, CA, USA; Department of Computer Science, University of California, Irvine, CA, USA", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yeh22a.html", "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "Pomona College;Occidental College;University of California, Santa Cruz;University of California, Irvine", "aff_unique_dep": "Computer Science Department;Computer Science Department;Computer Science Department;Department of Computer Science", "aff_unique_url": "https://www.pomona.edu;https://www.oxy.edu;https://www.ucsc.edu;https://www.uci.edu", "aff_unique_abbr": "Pomona College;Oxy;UCSC;UCI", "aff_campus_unique_index": "0;0;1;2;3", "aff_campus_unique": "Claremont;Los Angeles;Santa Cruz;Irvine", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Being Properly Improper", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16141", "id": "16141", "proceeding": 
"https://proceedings.mlr.press/v162/sypherd22a.html", "poster": "/media/PosterPDFs/ICML%202022/e7e23670481ac78b3c4122a99ba60573.png?t=1657560426.3838089", "slides": "", "author_site": "Tyler Sypherd, Richard Nock, Lalitha Sankar", "author": "Tyler Sypherd; Richard Nock; Lalitha Sankar", "abstract": "Properness for supervised losses stipulates that the loss function shapes the learning algorithm towards the true posterior of the data generating distribution. Unfortunately, data in modern machine learning can be corrupted or twisted in many ways. Hence, optimizing a proper loss function on twisted data could perilously lead the learning algorithm towards the twisted posterior, rather than to the desired clean posterior. Many papers cope with specific twists (e.g., label/feature/adversarial noise), but there is a growing need for a unified and actionable understanding atop properness. Our chief theoretical contribution is a generalization of the properness framework with a notion called twist-properness, which delineates loss functions with the ability to \"untwist\" the twisted posterior into the clean posterior. Notably, we show that a nontrivial extension of a loss function called alpha-loss, which was first introduced in information theory, is twist-proper. We study the twist-proper alpha-loss under a novel boosting algorithm, called PILBoost, and provide formal and experimental results for this algorithm. Our overarching practical conclusion is that the twist-proper alpha-loss outperforms the proper log-loss on several variants of twisted data.", "bibtex": "@InProceedings{pmlr-v162-sypherd22a,\n title = \t {Being Properly Improper},\n author = {Sypherd, Tyler and Nock, Richard and Sankar, Lalitha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20891--20932},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sypherd22a/sypherd22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sypherd22a.html},\n abstract = \t {Properness for supervised losses stipulates that the loss function shapes the learning algorithm towards the true posterior of the data generating distribution. Unfortunately, data in modern machine learning can be corrupted or twisted in many ways. Hence, optimizing a proper loss function on twisted data could perilously lead the learning algorithm towards the twisted posterior, rather than to the desired clean posterior. Many papers cope with specific twists (e.g., label/feature/adversarial noise), but there is a growing need for a unified and actionable understanding atop properness. Our chief theoretical contribution is a generalization of the properness framework with a notion called twist-properness, which delineates loss functions with the ability to \"untwist\" the twisted posterior into the clean posterior. Notably, we show that a nontrivial extension of a loss function called alpha-loss, which was first introduced in information theory, is twist-proper. We study the twist-proper alpha-loss under a novel boosting algorithm, called PILBoost, and provide formal and experimental results for this algorithm. 
Our overarching practical conclusion is that the twist-proper alpha-loss outperforms the proper log-loss on several variants of twisted data.}\n}", "pdf": "https://proceedings.mlr.press/v162/sypherd22a/sypherd22a.pdf", "supp": "", "pdf_size": 14143179, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10415401689503925451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "School of Electrical, Computer and Energy Engineering, Arizona State University; Google Research; School of Electrical, Computer and Energy Engineering, Arizona State University", "aff_domain": "asu.edu; ; ", "email": "asu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sypherd22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Arizona State University;Google", "aff_unique_dep": "School of Electrical, Computer and Energy Engineering;Google Research", "aff_unique_url": "https://www.asu.edu;https://research.google", "aff_unique_abbr": "ASU;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Benchmarking and Analyzing Point Cloud Classification under Corruptions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18377", "id": "18377", "proceeding": "https://proceedings.mlr.press/v162/ren22c.html", "poster": "/media/PosterPDFs/ICML%202022/b5dc4e5d9b495d0196f61d45b26ef33e.png?t=1658115096.8160126", "slides": "", "author_site": "Jiawei Ren, Liang Pan, Ziwei Liu", "author": "Jiawei Ren; Liang Pan; Ziwei Liu", "abstract": "3D perception, especially point cloud classification, has achieved substantial progress. However, in real-world deployment, point cloud corruptions are inevitable due to the scene complexity, sensor inaccuracy, and processing imprecision. In this work, we aim to rigorously benchmark and analyze point cloud classification under corruptions. To conduct a systematic investigation, we first provide a taxonomy of common 3D corruptions and identify the atomic corruptions. Then, we perform a comprehensive evaluation on a wide range of representative point cloud models to understand their robustness and generalizability. Our benchmark results show that although point cloud classification performance improves over time, the state-of-the-art methods are on the verge of being less robust. Based on the obtained observations, we propose several effective techniques to enhance point cloud classifier robustness. We hope our comprehensive benchmark, in-depth analysis, and proposed techniques could spark future research in robust 3D perception.", "bibtex": "@InProceedings{pmlr-v162-ren22c,\n title = \t {Benchmarking and Analyzing Point Cloud Classification under Corruptions},\n author = {Ren, Jiawei and Pan, Liang and Liu, Ziwei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18559--18575},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ren22c/ren22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/ren22c.html},\n abstract = \t {3D perception, especially point cloud classification, has achieved substantial progress. 
However, in real-world deployment, point cloud corruptions are inevitable due to the scene complexity, sensor inaccuracy, and processing imprecision. In this work, we aim to rigorously benchmark and analyze point cloud classification under corruptions. To conduct a systematic investigation, we first provide a taxonomy of common 3D corruptions and identify the atomic corruptions. Then, we perform a comprehensive evaluation on a wide range of representative point cloud models to understand their robustness and generalizability. Our benchmark results show that although point cloud classification performance improves over time, the state-of-the-art methods are on the verge of being less robust. Based on the obtained observations, we propose several effective techniques to enhance point cloud classifier robustness. We hope our comprehensive benchmark, in-depth analysis, and proposed techniques could spark future research in robust 3D perception.}\n}", "pdf": "https://proceedings.mlr.press/v162/ren22c/ren22c.pdf", "supp": "", "pdf_size": 4162890, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4434116773940428233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "S-Lab, Nanyang Technological University; S-Lab, Nanyang Technological University; S-Lab, Nanyang Technological University", "aff_domain": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "email": "ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "github": "https://github.com/jiawei-ren/modelnetc", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ren22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "S-Lab", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Benefits of Overparameterized Convolutional Residual Networks: Function Approximation under Smoothness Constraint", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18119", "id": "18119", "proceeding": "https://proceedings.mlr.press/v162/liu22c.html", "poster": "/media/PosterPDFs/ICML%202022/a36b598abb934e4528412e5a2127b931_QUjU99r.png?t=1658113919.6478689", "slides": "/media/icml-2022/Slides/18119.pdf", "author_site": "Hao Liu, Minshuo Chen, Siawpeng Er, Wenjing Liao, Tong Zhang, Tuo Zhao", "author": "Hao Liu; Minshuo Chen; Siawpeng Er; Wenjing Liao; Tong Zhang; Tuo Zhao", "abstract": "Overparameterized neural networks enjoy great representation power on complex data, and more importantly yield sufficiently smooth output, which is crucial to their generalization and robustness. Most existing function approximation theories suggest that with sufficiently many parameters, neural networks can well approximate certain classes of functions in terms of the function value. The neural networks themselves, however, can be highly nonsmooth. To bridge this gap, we take convolutional residual networks (ConvResNets) as an example, and prove that large ConvResNets can not only approximate a target function in terms of function value, but also exhibit sufficient first-order smoothness. Moreover, we extend our theory to approximating functions supported on a low-dimensional manifold. Our theory partially justifies the benefits of using deep and wide networks in practice. 
Numerical experiments on adversarially robust image classification are provided to support our theory.", "bibtex": "@InProceedings{pmlr-v162-liu22c,\n title = \t {Benefits of Overparameterized Convolutional Residual Networks: Function Approximation under Smoothness Constraint},\n author = {Liu, Hao and Chen, Minshuo and Er, Siawpeng and Liao, Wenjing and Zhang, Tong and Zhao, Tuo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13669--13703},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22c/liu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22c.html},\n abstract = \t {Overparameterized neural networks enjoy great representation power on complex data, and more importantly yield sufficiently smooth output, which is crucial to their generalization and robustness. Most existing function approximation theories suggest that with sufficiently many parameters, neural networks can well approximate certain classes of functions in terms of the function value. The neural networks themselves, however, can be highly nonsmooth. To bridge this gap, we take convolutional residual networks (ConvResNets) as an example, and prove that large ConvResNets can not only approximate a target function in terms of function value, but also exhibit sufficient first-order smoothness. Moreover, we extend our theory to approximating functions supported on a low-dimensional manifold. Our theory partially justifies the benefits of using deep and wide networks in practice. 
Numerical experiments on adversarially robust image classification are provided to support our theory.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22c/liu22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liu22c-supp.zip", "pdf_size": 1136528, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11835684133068489548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Mathematics, Hong Kong Baptist University; School of Industrial and Systems Engineering, Georgia Institute of Technology; School of Industrial and Systems Engineering, Georgia Institute of Technology; School of Mathematics, Georgia Institute of Technology; Department of Mathematics and Department of Computer Science and Engineering, The Hong Kong University of Science and Technology + Google Research; School of Industrial and Systems Engineering, Georgia Institute of Technology", "aff_domain": "hkbu.edu.hk;gatech.edu;gatech.edu;gatech.edu;cse.ust.hk;gatech.edu", "email": "hkbu.edu.hk;gatech.edu;gatech.edu;gatech.edu;cse.ust.hk;gatech.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/liu22c.html", "aff_unique_index": "0;1;1;1;2+3;1", "aff_unique_norm": "Hong Kong Baptist University;Georgia Institute of Technology;Hong Kong University of Science and Technology;Google", "aff_unique_dep": "Department of Mathematics;School of Industrial and Systems Engineering;Department of Mathematics;Google Research", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.gatech.edu;https://www.ust.hk;https://research.google", "aff_unique_abbr": "HKBU;Georgia Tech;HKUST;Google Research", "aff_campus_unique_index": "0;1;1;1;0+2;1", "aff_campus_unique": "Hong Kong SAR;Atlanta;Mountain View", "aff_country_unique_index": "0;1;1;1;0+1;1", "aff_country_unique": "China;United States" }, { "title": "Beyond Images: Label Noise Transition Matrix Estimation for Tasks with Lower-Quality Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17067", "id": "17067", "proceeding": "https://proceedings.mlr.press/v162/zhu22k.html", "poster": "/media/PosterPDFs/ICML%202022/8303a79b1e19a194f1875981be5bdb6f.png?t=1658242887.0992785", "slides": "", "author_site": "Zhaowei Zhu, Jialu Wang, Yang Liu", "author": "Zhaowei Zhu; Jialu Wang; Yang Liu", "abstract": "The label noise transition matrix, denoting the transition probabilities from clean labels to noisy labels, is crucial for designing statistically robust solutions. Existing estimators for noise transition matrices, e.g., using either anchor points or clusterability, focus on computer vision tasks that are relatively easier to obtain high-quality representations. We observe that tasks with lower-quality features fail to meet the anchor-point or clusterability condition, due to the coexistence of both uninformative and informative representations. To handle this issue, we propose a generic and practical information-theoretic approach to down-weight the less informative parts of the lower-quality features. This improvement is crucial to identifying and estimating the label noise transition matrix. The salient technical challenge is to compute the relevant information-theoretical metrics using only noisy labels instead of clean ones. We prove that the celebrated $f$-mutual information measure can often preserve the order when calculated using noisy labels. We then build our transition matrix estimator using this distilled version of features. 
The necessity and effectiveness of the proposed method are also demonstrated by evaluating the estimation error on a varied set of tabular data and text classification tasks with lower-quality features. Code is available at github.com/UCSC-REAL/BeyondImages.", "bibtex": "@InProceedings{pmlr-v162-zhu22k,\n title = \t {Beyond Images: Label Noise Transition Matrix Estimation for Tasks with Lower-Quality Features},\n author = {Zhu, Zhaowei and Wang, Jialu and Liu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27633--27653},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22k/zhu22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22k.html},\n abstract = \t {The label noise transition matrix, denoting the transition probabilities from clean labels to noisy labels, is crucial for designing statistically robust solutions. Existing estimators for noise transition matrices, e.g., using either anchor points or clusterability, focus on computer vision tasks that are relatively easier to obtain high-quality representations. We observe that tasks with lower-quality features fail to meet the anchor-point or clusterability condition, due to the coexistence of both uninformative and informative representations. To handle this issue, we propose a generic and practical information-theoretic approach to down-weight the less informative parts of the lower-quality features. This improvement is crucial to identifying and estimating the label noise transition matrix. The salient technical challenge is to compute the relevant information-theoretical metrics using only noisy labels instead of clean ones. We prove that the celebrated $f$-mutual information measure can often preserve the order when calculated using noisy labels. We then build our transition matrix estimator using this distilled version of features. The necessity and effectiveness of the proposed method are also demonstrated by evaluating the estimation error on a varied set of tabular data and text classification tasks with lower-quality features. 
Code is available at github.com/UCSC-REAL/BeyondImages.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22k/zhu22k.pdf", "supp": "", "pdf_size": 607510, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16994100334139808919&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA", "aff_domain": "ucsc.edu;ucsc.edu;ucsc.edu", "email": "ucsc.edu;ucsc.edu;ucsc.edu", "github": "github.com/UCSC-REAL/BeyondImages", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhu22k.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Santa Cruz", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ucsc.edu", "aff_unique_abbr": "UCSC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Beyond Worst-Case Analysis in Stochastic Approximation: Moment Estimation Improves Instance Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16567", "id": "16567", "proceeding": "https://proceedings.mlr.press/v162/zhang22r.html", "poster": "", "slides": "", "author_site": "Jingzhao Zhang, Hongzhou Lin, Subhro Das, Suvrit Sra, Ali Jadbabaie", "author": "Jingzhao Zhang; Hongzhou Lin; Subhro Das; Suvrit Sra; Ali Jadbabaie", "abstract": "We study oracle complexity of gradient based methods for stochastic approximation problems. Though in many settings optimal algorithms and tight lower bounds are known for such problems, these optimal algorithms do not achieve the best performance when used in practice. We address this theory-practice gap by focusing on", "bibtex": "@InProceedings{pmlr-v162-zhang22r,\n title = \t {Beyond Worst-Case Analysis in Stochastic Approximation: Moment Estimation Improves Instance Complexity},\n author = {Zhang, Jingzhao and Lin, Hongzhou and Das, Subhro and Sra, Suvrit and Jadbabaie, Ali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26347--26361},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22r/zhang22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22r.html},\n abstract = \t {We study oracle complexity of gradient based methods for stochastic approximation problems. Though in many settings optimal algorithms and tight lower bounds are known for such problems, these optimal algorithms do not achieve the best performance when used in practice. 
We address this theory-practice gap by focusing on", "pdf": "https://proceedings.mlr.press/v162/zhang22r/zhang22r.pdf", "supp": "", "pdf_size": 968613, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5194846270771140561&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "IIIS, Tsinghua University; Amazon; MIT-IBM Watson AI Lab, IBM Research; Massachusetts Institute of Technology; Massachusetts Institute of Technology", "aff_domain": "mail.tsinghua.edu.cn; ; ; ; ", "email": "mail.tsinghua.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22r.html", "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Tsinghua University;Amazon;IBM;Massachusetts Institute of Technology", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;Amazon.com, Inc.;AI Lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.amazon.com;https://www.ibmwatsonai.org/;https://web.mit.edu", "aff_unique_abbr": "THU;Amazon;MIT-IBM AI Lab;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Biased Gradient Estimate with Drastic Variance Reduction for Meta Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16369", "id": "16369", "proceeding": "https://proceedings.mlr.press/v162/tang22a.html", "poster": "", "slides": "", "author": "Yunhao Tang", "abstract": "Despite the empirical success of meta reinforcement learning (meta-RL), there are still a number of poorly-understood discrepancies between theory and practice. Critically, biased gradient estimates are almost always implemented in practice, whereas prior theory on meta-RL only establishes convergence under unbiased gradient estimates. In this work, we investigate such a discrepancy. In particular, (1) We show that unbiased gradient estimates have variance $\\Theta(N)$ which linearly depends on the sample size $N$ of the inner loop updates; (2) We propose linearized score function (LSF) gradient estimates, which have bias $\\mathcal{O}(1/\\sqrt{N})$ and variance $\\mathcal{O}(1/N)$; (3) We show that most empirical prior work in fact implements variants of the LSF gradient estimates. This implies that practical algorithms \"accidentally\" introduce bias to achieve better performance; (4) We establish theoretical guarantees for the LSF gradient estimates in meta-RL regarding its convergence to stationary points, showing better dependency on $N$ than prior work when $N$ is large.", "bibtex": "@InProceedings{pmlr-v162-tang22a,\n title = \t {Biased Gradient Estimate with Drastic Variance Reduction for Meta Reinforcement Learning},\n author = {Tang, Yunhao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21050--21075},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tang22a/tang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tang22a.html},\n abstract = \t {Despite the empirical success of meta reinforcement learning (meta-RL), there are still a number of poorly-understood discrepancies between theory and practice. 
Critically, biased gradient estimates are almost always implemented in practice, whereas prior theory on meta-RL only establishes convergence under unbiased gradient estimates. In this work, we investigate such a discrepancy. In particular, (1) We show that unbiased gradient estimates have variance $\\Theta(N)$ which linearly depends on the sample size $N$ of the inner loop updates; (2) We propose linearized score function (LSF) gradient estimates, which have bias $\\mathcal{O}(1/\\sqrt{N})$ and variance $\\mathcal{O}(1/N)$; (3) We show that most empirical prior work in fact implements variants of the LSF gradient estimates. This implies that practical algorithms \"accidentally\" introduce bias to achieve better performance; (4) We establish theoretical guarantees for the LSF gradient estimates in meta-RL regarding its convergence to stationary points, showing better dependency on $N$ than prior work when $N$ is large.}\n}", "pdf": "https://proceedings.mlr.press/v162/tang22a/tang22a.pdf", "supp": "", "pdf_size": 681761, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7398034372847722269&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "DeepMind", "aff_domain": "deepmind.com", "email": "deepmind.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/tang22a.html", "aff_unique_index": "0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Biological Sequence Design with GFlowNets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17337", "id": "17337", "proceeding": "https://proceedings.mlr.press/v162/jain22a.html", "poster": "/media/PosterPDFs/ICML%202022/2327fdecafc97928d5ba62af00a05704_nAgr1l8.png?t=1657661612.7752044", "slides": "", "author_site": "Moksh Jain, Emmanuel Bengio, Alex Hernandez-Garcia, Jarrid Rector-Brooks, Bonaventure Dossou, Chanakya Ekbote, Jie Fu, Tianyu Zhang, Michael Kilgour, Dinghuai Zhang, Lena Simine, Payel Das, Yoshua Bengio", "author": "Moksh Jain; Emmanuel Bengio; Alex Hernandez-Garcia; Jarrid Rector-Brooks; Bonaventure F. P. Dossou; Chanakya Ajit Ekbote; Jie Fu; Tianyu Zhang; Michael Kilgour; Dinghuai Zhang; Lena Simine; Payel Das; Yoshua Bengio", "abstract": "Design of de novo biological sequences with desired properties, like protein and DNA sequences, often involves an active loop with several rounds of molecule ideation and expensive wet-lab evaluations. These experiments can consist of multiple stages, with increasing levels of precision and cost of evaluation, where candidates are filtered. This makes the diversity of proposed candidates a key consideration in the ideation phase. In this work, we propose an active learning algorithm leveraging epistemic uncertainty estimation and the recently proposed GFlowNets as a generator of diverse candidate solutions, with the objective to obtain a diverse batch of useful (as defined by some utility function, for example, the predicted anti-microbial activity of a peptide) and informative candidates after each round. We also propose a scheme to incorporate existing labeled datasets of candidates, in addition to a reward function, to speed up learning in GFlowNets. 
We present empirical results on several biological sequence design tasks, and we find that our method generates more diverse and novel batches with high scoring candidates compared to existing approaches.", "bibtex": "@InProceedings{pmlr-v162-jain22a,\n title = \t {Biological Sequence Design with {GF}low{N}ets},\n author = {Jain, Moksh and Bengio, Emmanuel and Hernandez-Garcia, Alex and Rector-Brooks, Jarrid and Dossou, Bonaventure F. P. and Ekbote, Chanakya Ajit and Fu, Jie and Zhang, Tianyu and Kilgour, Michael and Zhang, Dinghuai and Simine, Lena and Das, Payel and Bengio, Yoshua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9786--9801},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jain22a/jain22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jain22a.html},\n abstract = \t {Design of de novo biological sequences with desired properties, like protein and DNA sequences, often involves an active loop with several rounds of molecule ideation and expensive wet-lab evaluations. These experiments can consist of multiple stages, with increasing levels of precision and cost of evaluation, where candidates are filtered. This makes the diversity of proposed candidates a key consideration in the ideation phase. In this work, we propose an active learning algorithm leveraging epistemic uncertainty estimation and the recently proposed GFlowNets as a generator of diverse candidate solutions, with the objective to obtain a diverse batch of useful (as defined by some utility function, for example, the predicted anti-microbial activity of a peptide) and informative candidates after each round. We also propose a scheme to incorporate existing labeled datasets of candidates, in addition to a reward function, to speed up learning in GFlowNets. 
We present empirical results on several biological sequence design tasks, and we find that our method generates more diverse and novel batches with high scoring candidates compared to existing approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/jain22a/jain22a.pdf", "supp": "", "pdf_size": 599439, "gs_citation": 203, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13153301030980981497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Mila+Universit\u00e9 de Montr\u00e9al; Mila+Universit\u00e9 de Montr\u00e9al; Mila+Universit\u00e9 de Montr\u00e9al; Mila+Universit\u00e9 de Montr\u00e9al; Mila+Universit\u00e9 de Montr\u00e9al+Jacobs University Bremen; Mila; Mila+Universit\u00e9 de Montr\u00e9al; Mila+Universit\u00e9 de Montr\u00e9al; New York University; Mila+Universit\u00e9 de Montr\u00e9al; McGill University; IBM; Mila+Universit\u00e9 de Montr\u00e9al+CIFAR Fellow and AI Chair", "aff_domain": "gmail.com; ; ; ; ; ; ; ; ; ; ; ;", "email": "gmail.com; ; ; ; ; ; ; ; ; ; ; ;", "github": "", "project": "", "author_num": 13, "oa": "https://proceedings.mlr.press/v162/jain22a.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0+1+2;0;0+1;0+1;3;0+1;4;5;0+1+6", "aff_unique_norm": "Mila;Universit\u00e9 de Montr\u00e9al;Jacobs University;New York University;McGill University;International Business Machines Corporation;CIFAR", "aff_unique_dep": "Quebec Artificial Intelligence Institute;;;;;;AI", "aff_unique_url": "https://mila.quebec;https://www.umontreal.ca;https://www.jacobs-university.de;https://www.nyu.edu;https://www.mcgill.ca;https://www.ibm.com;https://www.cifar.ca", "aff_unique_abbr": "Mila;UdeM;JUB;NYU;McGill;IBM;CIFAR", "aff_campus_unique_index": ";;;;1;;;;", "aff_campus_unique": ";Bremen", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0+1;0;0+0;0+0;2;0+0;0;2;0+0+0", "aff_country_unique": "Canada;Germany;United States" }, { "title": "Bisimulation Makes Analogies in Goal-Conditioned Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17501", "id": "17501", "proceeding": "https://proceedings.mlr.press/v162/hansen-estruch22a.html", "poster": "/media/PosterPDFs/ICML%202022/24389bfe4fe2eba8bf9aa9203a44cdad_Ihoi17m.png?t=1657510533.467334", "slides": "", "author_site": "Philippe Hansen-Estruch, Amy Zhang, Ashvin Nair, Patrick Yin, Sergey Levine", "author": "Philippe Hansen-Estruch; Amy Zhang; Ashvin Nair; Patrick Yin; Sergey Levine", "abstract": "Building generalizable goal-conditioned agents from rich observations is a key to reinforcement learning (RL) solving real world problems. Traditionally in goal-conditioned RL, an agent is provided with the exact goal they intend to reach. However, it is often not realistic to know the configuration of the goal before performing a task. A more scalable framework would allow us to provide the agent with an example of an analogous task, and have the agent then infer what the goal should be for its current state. We propose a new form of state abstraction called goal-conditioned bisimulation that captures functional equivariance, allowing for the reuse of skills to achieve new goals. We learn this representation using a metric form of this abstraction, and show its ability to generalize to new goals in real world manipulation tasks. 
Further, we prove that this learned representation is sufficient not only for goal-conditioned tasks, but is amenable to any downstream task described by a state-only reward function.", "bibtex": "@InProceedings{pmlr-v162-hansen-estruch22a,\n title = \t {Bisimulation Makes Analogies in Goal-Conditioned Reinforcement Learning},\n author = {Hansen-Estruch, Philippe and Zhang, Amy and Nair, Ashvin and Yin, Patrick and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8407--8426},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hansen-estruch22a/hansen-estruch22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hansen-estruch22a.html},\n abstract = \t {Building generalizable goal-conditioned agents from rich observations is a key to reinforcement learning (RL) solving real world problems. Traditionally in goal-conditioned RL, an agent is provided with the exact goal they intend to reach. However, it is often not realistic to know the configuration of the goal before performing a task. A more scalable framework would allow us to provide the agent with an example of an analogous task, and have the agent then infer what the goal should be for its current state. We propose a new form of state abstraction called goal-conditioned bisimulation that captures functional equivariance, allowing for the reuse of skills to achieve new goals. We learn this representation using a metric form of this abstraction, and show its ability to generalize to new goals in real world manipulation tasks. 
Further, we prove that this learned representation is sufficient not only for goal-conditioned tasks, but is amenable to any downstream task described by a state-only reward function.}\n}", "pdf": "https://proceedings.mlr.press/v162/hansen-estruch22a/hansen-estruch22a.pdf", "supp": "", "pdf_size": 2347283, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16823375554468751423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of California, Berkeley; University of California, Berkeley + Meta AI Research; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley", "aff_domain": "berkeley.edu;fb.com; ; ; ", "email": "berkeley.edu;fb.com; ; ; ", "github": "", "project": "https://sites.google.com/view/gc-bisimulation", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/hansen-estruch22a.html", "aff_unique_index": "0;0+1;0;0;0", "aff_unique_norm": "University of California, Berkeley;Meta", "aff_unique_dep": ";Meta AI Research", "aff_unique_url": "https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;Meta AI", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bit Prioritization in Variational Autoencoders via Progressive Coding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17023", "id": "17023", "proceeding": "https://proceedings.mlr.press/v162/shu22a.html", "poster": "", "slides": "", "author_site": "Rui Shu, Stefano Ermon", "author": "Rui Shu; Stefano Ermon", "abstract": "The hierarchical variational autoencoder (HVAE) is a popular generative model used for many representation learning tasks. However, its application to image synthesis often yields models with poor sample quality. In this work, we treat image synthesis itself as a hierarchical representation learning problem and regularize an HVAE toward representations that improve the model\u2019s image synthesis performance. We do so by leveraging the progressive coding hypothesis, which claims hierarchical latent variable models that are good at progressive lossy compression will generate high-quality samples. To test this hypothesis, we first show empirically that conventionally-trained HVAEs are not good progressive coders. We then propose a simple method that constrains the hierarchical representations to prioritize the encoding of information beneficial for lossy compression, and show that this modification leads to improved sample quality. 
Our work lends further support to the progressive coding hypothesis and demonstrates that this hypothesis should be exploited when designing variational autoencoders.", "bibtex": "@InProceedings{pmlr-v162-shu22a,\n title = \t {Bit Prioritization in Variational Autoencoders via Progressive Coding},\n author = {Shu, Rui and Ermon, Stefano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20141--20155},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shu22a/shu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shu22a.html},\n abstract = \t {The hierarchical variational autoencoder (HVAE) is a popular generative model used for many representation learning tasks. However, its application to image synthesis often yields models with poor sample quality. In this work, we treat image synthesis itself as a hierarchical representation learning problem and regularize an HVAE toward representations that improve the model\u2019s image synthesis performance. We do so by leveraging the progressive coding hypothesis, which claims hierarchical latent variable models that are good at progressive lossy compression will generate high-quality samples. To test this hypothesis, we first show empirically that conventionally-trained HVAEs are not good progressive coders. We then propose a simple method that constrains the hierarchical representations to prioritize the encoding of information beneficial for lossy compression, and show that this modification leads to improved sample quality. Our work lends further support to the progressive coding hypothesis and demonstrates that this hypothesis should be exploited when designing variational autoencoders.}\n}", "pdf": "https://proceedings.mlr.press/v162/shu22a/shu22a.pdf", "supp": "", "pdf_size": 15156471, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15842218078313996238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Stanford Computer Science; Stanford Computer Science", "aff_domain": "stanford.edu; ", "email": "stanford.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/shu22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Bitwidth Heterogeneous Federated Learning with Progressive Weight Dequantization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18265", "id": "18265", "proceeding": "https://proceedings.mlr.press/v162/yoon22a.html", "poster": "/media/PosterPDFs/ICML%202022/1c383cd30b7c298ab50293adfecb7b18.png?t=1657528350.6251407", "slides": "", "author_site": "Jaehong Yoon, Geon Park, Wonyong Jeong, Sung Ju Hwang", "author": "Jaehong Yoon; Geon Park; Wonyong Jeong; Sung Ju Hwang", "abstract": "In practical federated learning scenarios, the participating devices may have different bitwidths for computation and memory storage by design. 
However, despite the progress made in device-heterogeneous federated learning scenarios, the heterogeneity in the bitwidth specifications in the hardware has been mostly overlooked. We introduce a pragmatic FL scenario with bitwidth heterogeneity across the participating devices, dubbed as Bitwidth Heterogeneous Federated Learning (BHFL). BHFL brings in a new challenge, that the aggregation of model parameters with different bitwidths could result in severe performance degeneration, especially for high-bitwidth models. To tackle this problem, we propose ProWD framework, which has a trainable weight dequantizer at the central server that progressively reconstructs the low-bitwidth weights into higher bitwidth weights, and finally into full-precision weights. ProWD further selectively aggregates the model parameters to maximize the compatibility across bit-heterogeneous weights. We validate ProWD against relevant FL baselines on the benchmark datasets, using clients with varying bitwidths. Our ProWD largely outperforms the baseline FL algorithms as well as naive approaches (e.g. grouped averaging) under the proposed BHFL scenario.", "bibtex": "@InProceedings{pmlr-v162-yoon22a,\n title = \t {Bitwidth Heterogeneous Federated Learning with Progressive Weight Dequantization},\n author = {Yoon, Jaehong and Park, Geon and Jeong, Wonyong and Hwang, Sung Ju},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25552--25565},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yoon22a/yoon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yoon22a.html},\n abstract = \t {In practical federated learning scenarios, the participating devices may have different bitwidths for computation and memory storage by design. However, despite the progress made in device-heterogeneous federated learning scenarios, the heterogeneity in the bitwidth specifications in the hardware has been mostly overlooked. We introduce a pragmatic FL scenario with bitwidth heterogeneity across the participating devices, dubbed as Bitwidth Heterogeneous Federated Learning (BHFL). BHFL brings in a new challenge, that the aggregation of model parameters with different bitwidths could result in severe performance degeneration, especially for high-bitwidth models. To tackle this problem, we propose ProWD framework, which has a trainable weight dequantizer at the central server that progressively reconstructs the low-bitwidth weights into higher bitwidth weights, and finally into full-precision weights. ProWD further selectively aggregates the model parameters to maximize the compatibility across bit-heterogeneous weights. We validate ProWD against relevant FL baselines on the benchmark datasets, using clients with varying bitwidths. Our ProWD largely outperforms the baseline FL algorithms as well as naive approaches (e.g. 
grouped averaging) under the proposed BHFL scenario.}\n}", "pdf": "https://proceedings.mlr.press/v162/yoon22a/yoon22a.pdf", "supp": "", "pdf_size": 1496114, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=129382884452382802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea+AITRICS, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr; ;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr; ;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yoon22a.html", "aff_unique_index": "0;0;0;0+1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;AITRICS", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "Black-Box Tuning for Language-Model-as-a-Service", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17783", "id": "17783", "proceeding": "https://proceedings.mlr.press/v162/sun22e.html", "poster": "/media/PosterPDFs/ICML%202022/ef575e8837d065a1683c022d2077d342_GbkiyvM.png?t=1657548530.3049085", "slides": "", "author_site": "Tianxiang Sun, Yunfan Shao, Hong Qian, Xuanjing Huang, Xipeng Qiu", "author": "Tianxiang Sun; Yunfan Shao; Hong Qian; Xuanjing Huang; Xipeng Qiu", "abstract": "Extremely large pre-trained language models (PTMs) such as GPT-3 are usually released as a service. It allows users to design task-specific prompts to query the PTMs through some black-box APIs. In such a scenario, which we call Language-Model-as-a-Service (LMaaS), the gradients of PTMs are usually unavailable. Can we optimize the task prompts by only accessing the model inference APIs? This paper proposes the black-box tuning framework to optimize the continuous prompt prepended to the input text via derivative-free optimization. Instead of optimizing in the original high-dimensional prompt space, which is intractable for traditional derivative-free optimization, we perform optimization in a randomly generated subspace due to the low intrinsic dimensionality of large PTMs. 
The experimental results show that the black-box tuning with RoBERTa on a few labeled samples not only significantly outperforms manual prompt and GPT-3\u2019s in-context learning, but also surpasses the gradient-based counterparts, i.e., prompt tuning and full model tuning.", "bibtex": "@InProceedings{pmlr-v162-sun22e,\n title = \t {Black-Box Tuning for Language-Model-as-a-Service},\n author = {Sun, Tianxiang and Shao, Yunfan and Qian, Hong and Huang, Xuanjing and Qiu, Xipeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20841--20855},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sun22e/sun22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/sun22e.html},\n abstract = \t {Extremely large pre-trained language models (PTMs) such as GPT-3 are usually released as a service. It allows users to design task-specific prompts to query the PTMs through some black-box APIs. In such a scenario, which we call Language-Model-as-a-Service (LMaaS), the gradients of PTMs are usually unavailable. Can we optimize the task prompts by only accessing the model inference APIs? This paper proposes the black-box tuning framework to optimize the continuous prompt prepended to the input text via derivative-free optimization. Instead of optimizing in the original high-dimensional prompt space, which is intractable for traditional derivative-free optimization, we perform optimization in a randomly generated subspace due to the low intrinsic dimensionality of large PTMs. The experimental results show that the black-box tuning with RoBERTa on a few labeled samples not only significantly outperforms manual prompt and GPT-3\u2019s in-context learning, but also surpasses the gradient-based counterparts, i.e., prompt tuning and full model tuning.}\n}", "pdf": "https://proceedings.mlr.press/v162/sun22e/sun22e.pdf", "supp": "", "pdf_size": 1305439, "gs_citation": 288, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6566630989334663783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Fudan University; Fudan University; East China Normal University; Fudan University; Fudan University+Peng Cheng Laboratory", "aff_domain": "fudan.edu.cn; ; ;fudan.edu.cn; ", "email": "fudan.edu.cn; ; ;fudan.edu.cn; ", "github": "", "project": "https://gpt3demo.com/", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/sun22e.html", "aff_unique_index": "0;0;1;0;0+2", "aff_unique_norm": "Fudan University;East China Normal University;Pengcheng Laboratory", "aff_unique_dep": ";;Peng Cheng Laboratory", "aff_unique_url": "https://www.fudan.edu.cn;http://www.ecnu.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "Fudan;ECNU;PCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "Blocks Assemble! 
Learning to Assemble with Large-Scale Structured Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17383", "id": "17383", "proceeding": "https://proceedings.mlr.press/v162/ghasemipour22a.html", "poster": "", "slides": "", "author_site": "Seyed Kamyar Seyed Ghasemipour, Satoshi Kataoka, Byron David, Daniel Freeman, Shixiang Gu, Igor Mordatch", "author": "Seyed Kamyar Seyed Ghasemipour; Satoshi Kataoka; Byron David; Daniel Freeman; Shixiang Shane Gu; Igor Mordatch", "abstract": "Assembly of multi-part physical structures is both a valuable end product for autonomous robotics, as well as a valuable diagnostic task for open-ended training of embodied intelligent agents. We introduce a naturalistic physics-based environment with a set of connectable magnet blocks inspired by children\u2019s toy kits. The objective is to assemble blocks into a succession of target blueprints. Despite the simplicity of this objective, the compositional nature of building diverse blueprints from a set of blocks leads to an explosion of complexity in structures that agents encounter. Furthermore, assembly stresses agents\u2019 multi-step planning, physical reasoning, and bimanual coordination. We find that the combination of large-scale reinforcement learning and graph-based policies \u2013 surprisingly without any additional complexity \u2013 is an effective recipe for training agents that not only generalize to complex unseen blueprints in a zero-shot manner, but even operate in a reset-free setting without being trained to do so. Through extensive experiments, we highlight the importance of large-scale training, structured representations, contributions of multi-task vs. single-task learning, as well as the effects of curriculums, and discuss qualitative behaviors of trained agents. Our accompanying project webpage can be found at: https://sites.google.com/view/learning-direct-assembly/home", "bibtex": "@InProceedings{pmlr-v162-ghasemipour22a,\n title = \t {Blocks Assemble! {L}earning to Assemble with Large-Scale Structured Reinforcement Learning},\n author = {Ghasemipour, Seyed Kamyar Seyed and Kataoka, Satoshi and David, Byron and Freeman, Daniel and Gu, Shixiang Shane and Mordatch, Igor},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7435--7469},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghasemipour22a/ghasemipour22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghasemipour22a.html},\n abstract = \t {Assembly of multi-part physical structures is both a valuable end product for autonomous robotics, as well as a valuable diagnostic task for open-ended training of embodied intelligent agents. We introduce a naturalistic physics-based environment with a set of connectable magnet blocks inspired by children\u2019s toy kits. The objective is to assemble blocks into a succession of target blueprints. Despite the simplicity of this objective, the compositional nature of building diverse blueprints from a set of blocks leads to an explosion of complexity in structures that agents encounter. Furthermore, assembly stresses agents\u2019 multi-step planning, physical reasoning, and bimanual coordination. 
We find that the combination of large-scale reinforcement learning and graph-based policies \u2013 surprisingly without any additional complexity \u2013 is an effective recipe for training agents that not only generalize to complex unseen blueprints in a zero-shot manner, but even operate in a reset-free setting without being trained to do so. Through extensive experiments, we highlight the importance of large-scale training, structured representations, contributions of multi-task vs. single-task learning, as well as the effects of curriculums, and discuss qualitative behaviors of trained agents. Our accompanying project webpage can be found at: https://sites.google.com/view/learning-direct-assembly/home}\n}", "pdf": "https://proceedings.mlr.press/v162/ghasemipour22a/ghasemipour22a.pdf", "supp": "", "pdf_size": 4459535, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13747687000184012990&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Google Research; Google Research; Google Research; Google Research; Google Research; Google Research", "aff_domain": "google.com; ; ; ; ; ", "email": "google.com; ; ; ; ; ", "github": "", "project": "sites.google.com/view/learning-direct-assembly", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ghasemipour22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Blurs Behave Like Ensembles: Spatial Smoothings to Improve Accuracy, Uncertainty, and Robustness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17459", "id": "17459", "proceeding": "https://proceedings.mlr.press/v162/park22b.html", "poster": "/media/PosterPDFs/ICML%202022/8d7d8ee069cb0cbbf816bbb65d56947e_KGOAkQu.png?t=1657652247.8114235", "slides": "", "author_site": "Namuk Park, Songkuk Kim", "author": "Namuk Park; Songkuk Kim", "abstract": "Neural network ensembles, such as Bayesian neural networks (BNNs), have shown success in the areas of uncertainty estimation and robustness. However, a crucial challenge prohibits their use in practice. BNNs require a large number of predictions to produce reliable results, leading to a significant increase in computational cost. To alleviate this issue, we propose spatial smoothing, a method that ensembles neighboring feature map points of convolutional neural networks. By simply adding a few blur layers to the models, we empirically show that spatial smoothing improves accuracy, uncertainty estimation, and robustness of BNNs across a whole range of ensemble sizes. In particular, BNNs incorporating spatial smoothing achieve high predictive performance merely with a handful of ensembles. Moreover, this method can also be applied to canonical deterministic neural networks to improve their performance. Several pieces of evidence suggest that the improvements can be attributed to the stabilized feature maps and the smoothing of the loss landscape. In addition, we provide a fundamental explanation for prior works {\u2014} namely, global average pooling, pre-activation, and ReLU6 {\u2014} by addressing them as special cases of spatial smoothing. 
These not only enhance accuracy, but also improve uncertainty estimation and robustness by making the loss landscape smoother in the same manner as spatial smoothing. The code is available at https://github.com/xxxnell/spatial-smoothing.", "bibtex": "@InProceedings{pmlr-v162-park22b,\n title = \t {Blurs Behave Like Ensembles: Spatial Smoothings to Improve Accuracy, Uncertainty, and Robustness},\n author = {Park, Namuk and Kim, Songkuk},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17390--17419},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/park22b/park22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/park22b.html},\n abstract = \t {Neural network ensembles, such as Bayesian neural networks (BNNs), have shown success in the areas of uncertainty estimation and robustness. However, a crucial challenge prohibits their use in practice. BNNs require a large number of predictions to produce reliable results, leading to a significant increase in computational cost. To alleviate this issue, we propose spatial smoothing, a method that ensembles neighboring feature map points of convolutional neural networks. By simply adding a few blur layers to the models, we empirically show that spatial smoothing improves accuracy, uncertainty estimation, and robustness of BNNs across a whole range of ensemble sizes. In particular, BNNs incorporating spatial smoothing achieve high predictive performance merely with a handful of ensembles. Moreover, this method can also be applied to canonical deterministic neural networks to improve their performance. Several pieces of evidence suggest that the improvements can be attributed to the stabilized feature maps and the smoothing of the loss landscape. In addition, we provide a fundamental explanation for prior works {\u2014} namely, global average pooling, pre-activation, and ReLU6 {\u2014} by addressing them as special cases of spatial smoothing. These not only enhance accuracy, but also improve uncertainty estimation and robustness by making the loss landscape smoother in the same manner as spatial smoothing. 
The code is available at https://github.com/xxxnell/spatial-smoothing.}\n}", "pdf": "https://proceedings.mlr.press/v162/park22b/park22b.pdf", "supp": "", "pdf_size": 2819716, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11971703868153296298&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "NAVER AI Lab; Yonsei University", "aff_domain": "navercorp.com;yonsei.ac.kr", "email": "navercorp.com;yonsei.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/park22b.html", "aff_unique_index": "0;1", "aff_unique_norm": "NAVER Corporation;Yonsei University", "aff_unique_dep": "NAVER AI Lab;", "aff_unique_url": "https://www.naver.com;https://www.yonsei.ac.kr", "aff_unique_abbr": "NAVER;Yonsei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Boosting Graph Structure Learning with Dummy Nodes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17951", "id": "17951", "proceeding": "https://proceedings.mlr.press/v162/liu22d.html", "poster": "/media/PosterPDFs/ICML%202022/afecc60f82be41c1b52f6705ec69e0f1_w9LETip.png?t=1656658572.6053572", "slides": "/media/icml-2022/Slides/17951_YMFKZIN.pdf", "author_site": "Xin Liu, Jiayang Cheng, Yangqiu Song, Xin Jiang", "author": "Xin Liu; Jiayang Cheng; Yangqiu Song; Xin Jiang", "abstract": "With the development of graph kernels and graph representation learning, many superior methods have been proposed to handle scalability and oversmoothing issues on graph structure learning. However, most of those strategies are designed based on practical experience rather than theoretical analysis. In this paper, we use a particular dummy node connecting to all existing vertices without affecting original vertex and edge properties. We further prove that such a dummy node can help build an efficient monomorphic edge-to-vertex transform and an epimorphic inverse to recover the original graph back. It also indicates that adding dummy nodes can preserve local and global structures for better graph representation learning. We extend graph kernels and graph neural networks with dummy nodes and conduct experiments on graph classification and subgraph isomorphism matching tasks. Empirical results demonstrate that taking graphs with dummy nodes as input significantly boosts graph structure learning, and using their edge-to-vertex graphs can also achieve similar results. We also discuss the gain of expressive power from the dummy in neural networks.", "bibtex": "@InProceedings{pmlr-v162-liu22d,\n title = \t {Boosting Graph Structure Learning with Dummy Nodes},\n author = {Liu, Xin and Cheng, Jiayang and Song, Yangqiu and Jiang, Xin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13704--13716},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22d/liu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22d.html},\n abstract = \t {With the development of graph kernels and graph representation learning, many superior methods have been proposed to handle scalability and oversmoothing issues on graph structure learning. 
However, most of those strategies are designed based on practical experience rather than theoretical analysis. In this paper, we use a particular dummy node connecting to all existing vertices without affecting original vertex and edge properties. We further prove that such a dummy node can help build an efficient monomorphic edge-to-vertex transform and an epimorphic inverse to recover the original graph back. It also indicates that adding dummy nodes can preserve local and global structures for better graph representation learning. We extend graph kernels and graph neural networks with dummy nodes and conduct experiments on graph classification and subgraph isomorphism matching tasks. Empirical results demonstrate that taking graphs with dummy nodes as input significantly boosts graph structure learning, and using their edge-to-vertex graphs can also achieve similar results. We also discuss the gain of expressive power from the dummy in neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22d/liu22d.pdf", "supp": "", "pdf_size": 3033424, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11720456442737654498&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong SAR, China+Huawei Noah\u2019s Ark Lab, Hong Kong SAR, China; Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong SAR, China; Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong SAR, China; Huawei Noah\u2019s Ark Lab, Hong Kong SAR, China", "aff_domain": "cse.ust.hk; ; ; ", "email": "cse.ust.hk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22d.html", "aff_unique_index": "0+1;0;0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Huawei", "aff_unique_dep": "Department of Computer Science and Engineering;Huawei Noah\u2019s Ark Lab", "aff_unique_url": "https://www.ust.hk;https://www.huawei.com/en/ai/noahs-ark-lab", "aff_unique_abbr": "HKUST;Huawei Noah\u2019s Ark Lab", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong;", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "China" }, { "title": "Born-Infeld (BI) for AI: Energy-Conserving Descent (ECD) for Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16039", "id": "16039", "proceeding": "https://proceedings.mlr.press/v162/de-luca22a.html", "poster": "/media/PosterPDFs/ICML%202022/9e3cfc48eccf81a0d57663e129aef3cb.png?t=1657392200.415797", "slides": "", "author_site": "Giuseppe Bruno De Luca, Eva Silverstein", "author": "Giuseppe Bruno De Luca; Eva Silverstein", "abstract": "We introduce a novel framework for optimization based on energy-conserving Hamiltonian dynamics in a strongly mixing (chaotic) regime and establish its key properties analytically and numerically. The prototype is a discretization of Born-Infeld dynamics, with a squared relativistic speed limit depending on the objective function. This class of frictionless, energy-conserving optimizers proceeds unobstructed until slowing naturally near the minimal loss, which dominates the phase space volume of the system. Building from studies of chaotic systems such as dynamical billiards, we formulate a specific algorithm with good performance on machine learning and PDE-solving tasks, including generalization. 
It cannot stop at a high local minimum, an advantage in non-convex loss functions, and proceeds faster than GD+momentum in shallow valleys.", "bibtex": "@InProceedings{pmlr-v162-de-luca22a,\n title = \t {Born-Infeld ({BI}) for {AI}: Energy-Conserving Descent ({ECD}) for Optimization},\n author = {De Luca, Giuseppe Bruno and Silverstein, Eva},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4918--4936},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/de-luca22a/de-luca22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/de-luca22a.html},\n abstract = \t {We introduce a novel framework for optimization based on energy-conserving Hamiltonian dynamics in a strongly mixing (chaotic) regime and establish its key properties analytically and numerically. The prototype is a discretization of Born-Infeld dynamics, with a squared relativistic speed limit depending on the objective function. This class of frictionless, energy-conserving optimizers proceeds unobstructed until slowing naturally near the minimal loss, which dominates the phase space volume of the system. Building from studies of chaotic systems such as dynamical billiards, we formulate a specific algorithm with good performance on machine learning and PDE-solving tasks, including generalization. It cannot stop at a high local minimum, an advantage in non-convex loss functions, and proceeds faster than GD+momentum in shallow valleys.}\n}", "pdf": "https://proceedings.mlr.press/v162/de-luca22a/de-luca22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/deluca22a-supp.zip", "pdf_size": 4612563, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11927103073322066327&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Stanford Institute for Theoretical Physics, Stanford University, Stanford, CA, 94306, USA; Stanford Institute for Theoretical Physics, Stanford University, Stanford, CA, 94306, USA", "aff_domain": "stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/de-luca22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Stanford Institute for Theoretical Physics", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Bounding Training Data Reconstruction in Private (Deep) Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16043", "id": "16043", "proceeding": "https://proceedings.mlr.press/v162/guo22c.html", "poster": "/media/PosterPDFs/ICML%202022/f73b76ce8949fe29bf2a537cfa420e8f.png?t=1657301298.1959615", "slides": "", "author_site": "Chuan Guo, Brian Karrer, Kamalika Chaudhuri, Laurens van der Maaten", "author": "Chuan Guo; Brian Karrer; Kamalika Chaudhuri; Laurens van der Maaten", "abstract": "Differential privacy is widely accepted as the de facto method for preventing data leakage in ML, and conventional wisdom suggests that it offers strong protection against privacy attacks. 
However, existing semantic guarantees for DP focus on membership inference, which may overestimate the adversary\u2019s capabilities and is not applicable when membership status itself is non-sensitive. In this paper, we derive the first semantic guarantees for DP mechanisms against training data reconstruction attacks under a formal threat model. We show that two distinct privacy accounting methods\u2014Renyi differential privacy and Fisher information leakage\u2014both offer strong semantic protection against data reconstruction attacks.", "bibtex": "@InProceedings{pmlr-v162-guo22c,\n title = \t {Bounding Training Data Reconstruction in Private (Deep) Learning},\n author = {Guo, Chuan and Karrer, Brian and Chaudhuri, Kamalika and van der Maaten, Laurens},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8056--8071},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22c/guo22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22c.html},\n abstract = \t {Differential privacy is widely accepted as the de facto method for preventing data leakage in ML, and conventional wisdom suggests that it offers strong protection against privacy attacks. However, existing semantic guarantees for DP focus on membership inference, which may overestimate the adversary\u2019s capabilities and is not applicable when membership status itself is non-sensitive. In this paper, we derive the first semantic guarantees for DP mechanisms against training data reconstruction attacks under a formal threat model. We show that two distinct privacy accounting methods\u2014Renyi differential privacy and Fisher information leakage\u2014both offer strong semantic protection against data reconstruction attacks.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22c/guo22c.pdf", "supp": "", "pdf_size": 1706572, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3008455373482985083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Meta AI; Meta; Meta AI; Meta AI", "aff_domain": "fb.com; ; ; ", "email": "fb.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/guo22c.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Meta AI", "aff_unique_url": "https://meta.com", "aff_unique_abbr": "Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bounding the Width of Neural Networks via Coupled Initialization A Worst Case Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16483", "id": "16483", "proceeding": "https://proceedings.mlr.press/v162/munteanu22a.html", "poster": "/media/PosterPDFs/ICML%202022/05b8caaf6ba6f4bdb68675ab8b893bda_VCvacrQ.png?t=1657880331.0063808", "slides": "", "author_site": "Alexander Munteanu, Simon Omlor, Zhao Song, David Woodruff", "author": "Alexander Munteanu; Simon Omlor; Zhao Song; David Woodruff", "abstract": "A common method in training neural networks is to initialize all the weights to be independent Gaussian vectors. 
We observe that by instead initializing the weights into independent pairs, where each pair consists of two identical Gaussian vectors, we can significantly improve the convergence analysis. While a similar technique has been studied for random inputs [Daniely, NeurIPS 2020], it has not been analyzed with arbitrary inputs. Using this technique, we show how to significantly reduce the number of neurons required for two-layer ReLU networks, both in the under-parameterized setting with logistic loss, from roughly $\\gamma^{-8}$ [Ji and Telgarsky, ICLR 2020] to $\\gamma^{-2}$, where $\\gamma$ denotes the separation margin with a Neural Tangent Kernel, as well as in the over-parameterized setting with squared loss, from roughly $n^4$ [Song and Yang, 2019] to $n^2$, implicitly also improving the recent running time bound of [Brand, Peng, Song and Weinstein, ITCS 2021]. For the under-parameterized setting we also prove new lower bounds that improve upon prior work, and that under certain assumptions, are best possible.", "bibtex": "@InProceedings{pmlr-v162-munteanu22a,\n title = \t {Bounding the Width of Neural Networks via Coupled Initialization A Worst Case Analysis},\n author = {Munteanu, Alexander and Omlor, Simon and Song, Zhao and Woodruff, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16083--16122},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/munteanu22a/munteanu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/munteanu22a.html},\n abstract = \t {A common method in training neural networks is to initialize all the weights to be independent Gaussian vectors. We observe that by instead initializing the weights into independent pairs, where each pair consists of two identical Gaussian vectors, we can significantly improve the convergence analysis. While a similar technique has been studied for random inputs [Daniely, NeurIPS 2020], it has not been analyzed with arbitrary inputs. Using this technique, we show how to significantly reduce the number of neurons required for two-layer ReLU networks, both in the under-parameterized setting with logistic loss, from roughly $\\gamma^{-8}$ [Ji and Telgarsky, ICLR 2020] to $\\gamma^{-2}$, where $\\gamma$ denotes the separation margin with a Neural Tangent Kernel, as well as in the over-parameterized setting with squared loss, from roughly $n^4$ [Song and Yang, 2019] to $n^2$, implicitly also improving the recent running time bound of [Brand, Peng, Song and Weinstein, ITCS 2021]. 
For the under-parameterized setting we also prove new lower bounds that improve upon prior work, and that under certain assumptions, are best possible.}\n}", "pdf": "https://proceedings.mlr.press/v162/munteanu22a/munteanu22a.pdf", "supp": "", "pdf_size": 767598, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6715564240178970214&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Dortmund Data Science Center, Faculties of Statistics and Computer Science, TU Dortmund University, Dortmund, Germany+Faculty of Statistics, TU Dortmund University, Dortmund, Germany; Faculty of Statistics, TU Dortmund University, Dortmund, Germany; Adobe Research; Department of Computer Science, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "tu-dortmund.de;tu-dortmund.de;adobe.com;cs.cmu.edu", "email": "tu-dortmund.de;tu-dortmund.de;adobe.com;cs.cmu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/munteanu22a.html", "aff_unique_index": "0+0;0;1;2", "aff_unique_norm": "TU Dortmund University;Adobe;Carnegie Mellon University", "aff_unique_dep": "Faculties of Statistics and Computer Science;Adobe Research;Department of Computer Science", "aff_unique_url": "https://www.tu-dortmund.de;https://research.adobe.com;https://www.cmu.edu", "aff_unique_abbr": "TU Dortmund;Adobe;CMU", "aff_campus_unique_index": "0+0;0;2", "aff_campus_unique": "Dortmund;;Pittsburgh", "aff_country_unique_index": "0+0;0;1;1", "aff_country_unique": "Germany;United States" }, { "title": "Branchformer: Parallel MLP-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18225", "id": "18225", "proceeding": "https://proceedings.mlr.press/v162/peng22a.html", "poster": "/media/PosterPDFs/ICML%202022/2adcfc3929e7c03fac3100d3ad51da26.png?t=1657747074.855753", "slides": "", "author_site": "Yifan Peng, Siddharth Dalmia, Ian Lane, Shinji Watanabe", "author": "Yifan Peng; Siddharth Dalmia; Ian Lane; Shinji Watanabe", "abstract": "Conformer has proven to be effective in many speech processing tasks. It combines the benefits of extracting local dependencies using convolutions and global dependencies using self-attention. Inspired by this, we propose a more flexible, interpretable and customizable encoder alternative, Branchformer, with parallel branches for modeling various ranged dependencies in end-to-end speech processing. In each encoder layer, one branch employs self-attention or its variant to capture long-range dependencies, while the other branch utilizes an MLP module with convolutional gating (cgMLP) to extract local relationships. We conduct experiments on several speech recognition and spoken language understanding benchmarks. Results show that our model outperforms both Transformer and cgMLP. It also matches with or outperforms state-of-the-art results achieved by Conformer. Furthermore, we show various strategies to reduce computation thanks to the two-branch architecture, including the ability to have variable inference complexity in a single trained model. 
The weights learned for merging branches indicate how local and global dependencies are utilized in different layers, which benefits model designing.", "bibtex": "@InProceedings{pmlr-v162-peng22a,\n title = \t {Branchformer: Parallel {MLP}-Attention Architectures to Capture Local and Global Context for Speech Recognition and Understanding},\n author = {Peng, Yifan and Dalmia, Siddharth and Lane, Ian and Watanabe, Shinji},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17627--17643},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/peng22a/peng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/peng22a.html},\n abstract = \t {Conformer has proven to be effective in many speech processing tasks. It combines the benefits of extracting local dependencies using convolutions and global dependencies using self-attention. Inspired by this, we propose a more flexible, interpretable and customizable encoder alternative, Branchformer, with parallel branches for modeling various ranged dependencies in end-to-end speech processing. In each encoder layer, one branch employs self-attention or its variant to capture long-range dependencies, while the other branch utilizes an MLP module with convolutional gating (cgMLP) to extract local relationships. We conduct experiments on several speech recognition and spoken language understanding benchmarks. Results show that our model outperforms both Transformer and cgMLP. It also matches with or outperforms state-of-the-art results achieved by Conformer. Furthermore, we show various strategies to reduce computation thanks to the two-branch architecture, including the ability to have variable inference complexity in a single trained model. 
The weights learned for merging branches indicate how local and global dependencies are utilized in different layers, which benefits model designing.}\n}", "pdf": "https://proceedings.mlr.press/v162/peng22a/peng22a.pdf", "supp": "", "pdf_size": 1456689, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8709670323739096599&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA 15213, USA+Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA 15213, USA+Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA 15213, USA; Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA", "aff_domain": "andrew.cmu.edu;cs.cmu.edu; ;andrew.cmu.edu", "email": "andrew.cmu.edu;cs.cmu.edu; ;andrew.cmu.edu", "github": "https://github.com/espnet/espnet", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/peng22a.html", "aff_unique_index": "0+0;0+0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Branching Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17087", "id": "17087", "proceeding": "https://proceedings.mlr.press/v162/du22a.html", "poster": "/media/PosterPDFs/ICML%202022/5e6bd7a6970cd4325e587f02667f7f73.png?t=1657464880.3907561", "slides": "", "author_site": "Yihan Du, Wei Chen", "author": "Yihan Du; Wei Chen", "abstract": "In this paper, we propose a novel Branching Reinforcement Learning (Branching RL) model, and investigate both Regret Minimization (RM) and Reward-Free Exploration (RFE) metrics for this model. Unlike standard RL where the trajectory of each episode is a single $H$-step path, branching RL allows an agent to take multiple base actions in a state such that transitions branch out to multiple successor states correspondingly, and thus it generates a tree-structured trajectory. This model finds important applications in hierarchical recommendation systems and online advertising. For branching RL, we establish new Bellman equations and key lemmas, i.e., branching value difference lemma and branching law of total variance, and also bound the total variance by only $O(H^2)$ under an exponentially-large trajectory. For RM and RFE metrics, we propose computationally efficient algorithms BranchVI and BranchRFE, respectively, and derive nearly matching upper and lower bounds. 
Our regret and sample complexity results are polynomial in all problem parameters despite exponentially-large trajectories.", "bibtex": "@InProceedings{pmlr-v162-du22a,\n title = \t {Branching Reinforcement Learning},\n author = {Du, Yihan and Chen, Wei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5494--5530},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/du22a/du22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/du22a.html},\n abstract = \t {In this paper, we propose a novel Branching Reinforcement Learning (Branching RL) model, and investigate both Regret Minimization (RM) and Reward-Free Exploration (RFE) metrics for this model. Unlike standard RL where the trajectory of each episode is a single $H$-step path, branching RL allows an agent to take multiple base actions in a state such that transitions branch out to multiple successor states correspondingly, and thus it generates a tree-structured trajectory. This model finds important applications in hierarchical recommendation systems and online advertising. For branching RL, we establish new Bellman equations and key lemmas, i.e., branching value difference lemma and branching law of total variance, and also bound the total variance by only $O(H^2)$ under an exponentially-large trajectory. For RM and RFE metrics, we propose computationally efficient algorithms BranchVI and BranchRFE, respectively, and derive nearly matching upper and lower bounds. Our regret and sample complexity results are polynomial in all problem parameters despite exponentially-large trajectories.}\n}", "pdf": "https://proceedings.mlr.press/v162/du22a/du22a.pdf", "supp": "", "pdf_size": 1982364, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JlGJwFRjR8wJ:scholar.google.com/&scioq=Branching+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 5, "aff": "IIIS, Tsinghua University, Beijing, China; Microsoft Research", "aff_domain": "mails.tsinghua.edu.cn;microsoft.com", "email": "mails.tsinghua.edu.cn;microsoft.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/du22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Tsinghua University;Microsoft", "aff_unique_dep": "IIIS;Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "THU;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Breaking Down Out-of-Distribution Detection: Many Methods Based on OOD Training Data Estimate a Combination of the Same Core Quantities", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17469", "id": "17469", "proceeding": "https://proceedings.mlr.press/v162/bitterwolf22a.html", "poster": "/media/PosterPDFs/ICML%202022/b6d67a24906e8a8541291882f81d31ca.png?t=1657717106.000094", "slides": "", "author_site": "Julian Bitterwolf, Alexander Meinke, Maximilian Augustin, Matthias Hein", "author": "Julian Bitterwolf; Alexander Meinke; Maximilian Augustin; Matthias Hein", "abstract": "It is an important problem in trustworthy machine learning to 
recognize out-of-distribution (OOD) inputs which are inputs unrelated to the in-distribution task. Many out-of-distribution detection methods have been suggested in recent years. The goal of this paper is to recognize common objectives as well as to identify the implicit scoring functions of different OOD detection methods. We focus on the sub-class of methods that use surrogate OOD data during training in order to learn an OOD detection score that generalizes to new unseen out-distributions at test time. We show that binary discrimination between in- and (different) out-distributions is equivalent to several distinct formulations of the OOD detection problem. When trained in a shared fashion with a standard classifier, this binary discriminator reaches an OOD detection performance similar to that of Outlier Exposure. Moreover, we show that the confidence loss which is used by Outlier Exposure has an implicit scoring function which differs in a non-trivial fashion from the theoretically optimal scoring function in the case where training and test out-distribution are the same, which again is similar to the one used when training an Energy-Based OOD detector or when adding a background class. In practice, when trained in exactly the same way, all these methods perform similarly.", "bibtex": "@InProceedings{pmlr-v162-bitterwolf22a,\n title = \t {Breaking Down Out-of-Distribution Detection: Many Methods Based on {OOD} Training Data Estimate a Combination of the Same Core Quantities},\n author = {Bitterwolf, Julian and Meinke, Alexander and Augustin, Maximilian and Hein, Matthias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2041--2074},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bitterwolf22a/bitterwolf22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bitterwolf22a.html},\n abstract = \t {It is an important problem in trustworthy machine learning to recognize out-of-distribution (OOD) inputs which are inputs unrelated to the in-distribution task. Many out-of-distribution detection methods have been suggested in recent years. The goal of this paper is to recognize common objectives as well as to identify the implicit scoring functions of different OOD detection methods. We focus on the sub-class of methods that use surrogate OOD data during training in order to learn an OOD detection score that generalizes to new unseen out-distributions at test time. We show that binary discrimination between in- and (different) out-distributions is equivalent to several distinct formulations of the OOD detection problem. When trained in a shared fashion with a standard classifier, this binary discriminator reaches an OOD detection performance similar to that of Outlier Exposure. Moreover, we show that the confidence loss which is used by Outlier Exposure has an implicit scoring function which differs in a non-trivial fashion from the theoretically optimal scoring function in the case where training and test out-distribution are the same, which again is similar to the one used when training an Energy-Based OOD detector or when adding a background class. 
In practice, when trained in exactly the same way, all these methods perform similarly.}\n}", "pdf": "https://proceedings.mlr.press/v162/bitterwolf22a/bitterwolf22a.pdf", "supp": "", "pdf_size": 434800, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3629472061640674656&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of T\u00fcbingen; University of T\u00fcbingen; University of T\u00fcbingen; University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de; ; ; ", "email": "uni-tuebingen.de; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bitterwolf22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of T\u00fcbingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Breaking the $\\sqrtT$ Barrier: Instance-Independent Logarithmic Regret in Stochastic Contextual Linear Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16485", "id": "16485", "proceeding": "https://proceedings.mlr.press/v162/ghosh22b.html", "poster": "/media/PosterPDFs/ICML%202022/6a4cbdaedcbda0fa8ddc7ea32073c475.png?t=1658108519.9119184", "slides": "", "author_site": "Avishek Ghosh, Abishek Sankararaman", "author": "Avishek Ghosh; Abishek Sankararaman", "abstract": "We prove an instance independent (poly) logarithmic regret for stochastic contextual bandits with linear payoff. Previously, in \\cite{chu2011contextual}, a lower bound of $\\mathcal{O}(\\sqrt{T})$ is shown for the contextual linear bandit problem with arbitrary (adversarily chosen) contexts. In this paper, we show that stochastic contexts indeed help to reduce the regret from $\\sqrt{T}$ to $\\polylog(T)$. We propose Low Regret Stochastic Contextual Bandits (\\texttt{LR-SCB}), which takes advantage of the stochastic contexts and performs parameter estimation (in $\\ell_2$ norm) and regret minimization simultaneously. \\texttt{LR-SCB} works in epochs, where the parameter estimation of the previous epoch is used to reduce the regret of the current epoch. The (poly) logarithmic regret of \\texttt{LR-SCB} stems from two crucial facts: (a) the application of a norm adaptive algorithm to exploit the parameter estimation and (b) an analysis of the shifted linear contextual bandit algorithm, showing that shifting results in increasing regret. 
We have also shown experimentally that stochastic contexts indeed incurs a regret that scales with $\\polylog(T)$.", "bibtex": "@InProceedings{pmlr-v162-ghosh22b,\n title = \t {Breaking the $\\sqrt{T}$ Barrier: Instance-Independent Logarithmic Regret in Stochastic Contextual Linear Bandits},\n author = {Ghosh, Avishek and Sankararaman, Abishek},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7531--7549},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghosh22b/ghosh22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghosh22b.html},\n abstract = \t {We prove an instance independent (poly) logarithmic regret for stochastic contextual bandits with linear payoff. Previously, in \\cite{chu2011contextual}, a lower bound of $\\mathcal{O}(\\sqrt{T})$ is shown for the contextual linear bandit problem with arbitrary (adversarily chosen) contexts. In this paper, we show that stochastic contexts indeed help to reduce the regret from $\\sqrt{T}$ to $\\polylog(T)$. We propose Low Regret Stochastic Contextual Bandits (\\texttt{LR-SCB}), which takes advantage of the stochastic contexts and performs parameter estimation (in $\\ell_2$ norm) and regret minimization simultaneously. \\texttt{LR-SCB} works in epochs, where the parameter estimation of the previous epoch is used to reduce the regret of the current epoch. The (poly) logarithmic regret of \\texttt{LR-SCB} stems from two crucial facts: (a) the application of a norm adaptive algorithm to exploit the parameter estimation and (b) an analysis of the shifted linear contextual bandit algorithm, showing that shifting results in increasing regret. We have also shown experimentally that stochastic contexts indeed incurs a regret that scales with $\\polylog(T)$.}\n}", "pdf": "https://proceedings.mlr.press/v162/ghosh22b/ghosh22b.pdf", "supp": "", "pdf_size": 805991, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mvvCfYHrE_0J:scholar.google.com/&scioq=Breaking+the+%24%5CsqrtT%24+Barrier:+Instance-Independent+Logarithmic+Regret+in+Stochastic+Contextual+Linear+Bandits&hl=en&as_sdt=0,5", "gs_version_total": 8, "aff": "Hal\u0131c\u0131o\u011flu Data Science Institute (HDSI), UC San Diego, USA; Dept. of Electrical Engg. 
and Computer Sciences, UC Berkeley, USA + Amazon AWS AI, Palo Alto, USA", "aff_domain": "ucsd.edu; ", "email": "ucsd.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ghosh22b.html", "aff_unique_index": "0;1+2", "aff_unique_norm": "University of California, San Diego;University of California, Berkeley;Amazon", "aff_unique_dep": "Hal\u0131c\u0131o\u011flu Data Science Institute;Department of Electrical Engineering and Computer Sciences;Amazon AWS AI", "aff_unique_url": "https://ucsd.edu;https://www.berkeley.edu;https://aws.amazon.com", "aff_unique_abbr": "UCSD;UC Berkeley;Amazon", "aff_campus_unique_index": "0;1+2", "aff_campus_unique": "San Diego;Berkeley;Palo Alto", "aff_country_unique_index": "0;0+0", "aff_country_unique": "United States" }, { "title": "Bregman Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17633", "id": "17633", "proceeding": "https://proceedings.mlr.press/v162/frecon22a.html", "poster": "/media/PosterPDFs/ICML%202022/d7488039246a405baf6a7cbc3613a56f_YzxOP2Z.png?t=1657265939.5943394", "slides": "", "author_site": "Jordan Frecon, Gilles Gasso, Massimiliano Pontil, Saverio Salzo", "author": "Jordan Frecon; Gilles Gasso; Massimiliano Pontil; Saverio Salzo", "abstract": "We present a framework based on bilevel optimization for learning multilayer, deep data representations. On the one hand, the lower-level problem finds a representation by successively minimizing layer-wise objectives made of the sum of a prescribed regularizer as well as a fidelity term and some linear function both depending on the representation found at the previous layer. On the other hand, the upper-level problem optimizes over the linear functions to yield a linearly separable final representation. We show that, by choosing the fidelity term as the quadratic distance between two successive layer-wise representations, the bilevel problem reduces to the training of a feed-forward neural network. Instead, by elaborating on Bregman distances, we devise a novel neural network architecture additionally involving the inverse of the activation function reminiscent of the skip connection used in ResNets. Numerical experiments suggest that the proposed Bregman variant benefits from better learning properties and more robust prediction performance.", "bibtex": "@InProceedings{pmlr-v162-frecon22a,\n title = \t {{B}regman Neural Networks},\n author = {Frecon, Jordan and Gasso, Gilles and Pontil, Massimiliano and Salzo, Saverio},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6779--6792},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/frecon22a/frecon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/frecon22a.html},\n abstract = \t {We present a framework based on bilevel optimization for learning multilayer, deep data representations. On the one hand, the lower-level problem finds a representation by successively minimizing layer-wise objectives made of the sum of a prescribed regularizer as well as a fidelity term and some linear function both depending on the representation found at the previous layer. 
On the other hand, the upper-level problem optimizes over the linear functions to yield a linearly separable final representation. We show that, by choosing the fidelity term as the quadratic distance between two successive layer-wise representations, the bilevel problem reduces to the training of a feed-forward neural network. Instead, by elaborating on Bregman distances, we devise a novel neural network architecture additionally involving the inverse of the activation function reminiscent of the skip connection used in ResNets. Numerical experiments suggest that the proposed Bregman variant benefits from better learning properties and more robust prediction performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/frecon22a/frecon22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/frecon22a-supp.zip", "pdf_size": 757263, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5056676567893443166&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Normandie Univ, INSA Rouen UNIROUEN, UNIHAVRE, LITIS, Saint-Etienne-du-Rouvray, France; Normandie Univ, INSA Rouen UNIROUEN, UNIHAVRE, LITIS, Saint-Etienne-du-Rouvray, France; Computational Statistics and Machine Learning, IIT, Genova, Italy+Department of Computer Science, UCL, London, United Kingdom; Computational Statistics and Machine Learning, IIT, Genova, Italy", "aff_domain": "insa-rouen.fr; ; ; ", "email": "insa-rouen.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/frecon22a.html", "aff_unique_index": "0;0;1+2;1", "aff_unique_norm": "Normandie University;Istituto Italiano di Tecnologia;University College London", "aff_unique_dep": "INSA Rouen;Computational Statistics and Machine Learning;Department of Computer Science", "aff_unique_url": ";https://www.iit.it;https://www.ucl.ac.uk", "aff_unique_abbr": ";IIT;UCL", "aff_campus_unique_index": "1+2;1", "aff_campus_unique": ";Genova;London", "aff_country_unique_index": "0;0;1+2;1", "aff_country_unique": "France;Italy;United Kingdom" }, { "title": "Bregman Power k-Means for Clustering Exponential Family Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16491", "id": "16491", "proceeding": "https://proceedings.mlr.press/v162/vellal22a.html", "poster": "/media/PosterPDFs/ICML%202022/d60678e8f2ba9c540798ebbde31177e8.png?t=1657665697.3816583", "slides": "", "author_site": "Adithya D Vellal, Saptarshi Chakraborty, Jason Xu", "author": "Adithya Vellal; Saptarshi Chakraborty; Jason Q Xu", "abstract": "Recent progress in center-based clustering algorithms combats poor local minima by implicit annealing through a family of generalized means. These methods are variations of Lloyd\u2019s celebrated k-means algorithm, and are most appropriate for spherical clusters such as those arising from Gaussian data. In this paper, we bridge these new algorithmic advances to classical work on hard clustering under Bregman divergences, which enjoy a bijection to exponential family distributions and are thus well-suited for clustering objects arising from a breadth of data generating mechanisms. The elegant properties of Bregman divergences allow us to maintain closed form updates in a simple and transparent algorithm, and moreover lead to new theoretical arguments for establishing finite sample bounds that relax the bounded support assumption made in the existing state of the art. 
Additionally, we consider thorough empirical analyses on simulated experiments and a case study on rainfall data, finding that the proposed method outperforms existing peer methods in a variety of non-Gaussian data settings.", "bibtex": "@InProceedings{pmlr-v162-vellal22a,\n title = \t {{B}regman Power k-Means for Clustering Exponential Family Data},\n author = {Vellal, Adithya and Chakraborty, Saptarshi and Xu, Jason Q},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22103--22119},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vellal22a/vellal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vellal22a.html},\n abstract = \t {Recent progress in center-based clustering algorithms combats poor local minima by implicit annealing through a family of generalized means. These methods are variations of Lloyd\u2019s celebrated k-means algorithm, and are most appropriate for spherical clusters such as those arising from Gaussian data. In this paper, we bridge these new algorithmic advances to classical work on hard clustering under Bregman divergences, which enjoy a bijection to exponential family distributions and are thus well-suited for clustering objects arising from a breadth of data generating mechanisms. The elegant properties of Bregman divergences allow us to maintain closed form updates in a simple and transparent algorithm, and moreover lead to new theoretical arguments for establishing finite sample bounds that relax the bounded support assumption made in the existing state of the art. 
Additionally, we consider thorough empirical analyses on simulated experiments and a case study on rainfall data, finding that the proposed method outperforms existing peer methods in a variety of non-Gaussian data settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/vellal22a/vellal22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/vellal22a-supp.zip", "pdf_size": 558498, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10416936130963333532&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Statistical Science, Duke University, Durham, NC, USA; Department of Statistics, University of California, Berkeley, CA, USA; Department of Statistical Science, Duke University, Durham, NC, USA", "aff_domain": "duke.edu;berkeley.edu;duke.edu", "email": "duke.edu;berkeley.edu;duke.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/vellal22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Duke University;University of California, Berkeley", "aff_unique_dep": "Department of Statistical Science;Department of Statistics", "aff_unique_url": "https://www.duke.edu;https://www.berkeley.edu", "aff_unique_abbr": "Duke;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Durham;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bregman Proximal Langevin Monte Carlo via Bregman-Moreau Envelopes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16695", "id": "16695", "proceeding": "https://proceedings.mlr.press/v162/lau22a.html", "poster": "/media/PosterPDFs/ICML%202022/95e6834d0a3d99e9ea8811855ae9229d.png?t=1657963571.4859438", "slides": "", "author_site": "Tim Tsz-Kit Lau, Han Liu", "author": "Tim Tsz-Kit Lau; Han Liu", "abstract": "We propose efficient Langevin Monte Carlo algorithms for sampling distributions with nonsmooth convex composite potentials, which is the sum of a continuously differentiable function and a possibly nonsmooth function. We devise such algorithms leveraging recent advances in convex analysis and optimization methods involving Bregman divergences, namely the Bregman\u2013Moreau envelopes and the Bregman proximity operators, and in the Langevin Monte Carlo algorithms reminiscent of mirror descent. The proposed algorithms extend existing Langevin Monte Carlo algorithms in two aspects\u2014the ability to sample nonsmooth distributions with mirror descent-like algorithms, and the use of the more general Bregman\u2013Moreau envelope in place of the Moreau envelope as a smooth approximation of the nonsmooth part of the potential. A particular case of the proposed scheme is reminiscent of the Bregman proximal gradient algorithm. 
The efficiency of the proposed methodology is illustrated with various sampling tasks at which existing Langevin Monte Carlo methods are known to perform poorly.", "bibtex": "@InProceedings{pmlr-v162-lau22a,\n title = \t {{B}regman Proximal {L}angevin {M}onte {C}arlo via {B}regman-Moreau Envelopes},\n author = {Lau, Tim Tsz-Kit and Liu, Han},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12049--12077},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lau22a/lau22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lau22a.html},\n abstract = \t {We propose efficient Langevin Monte Carlo algorithms for sampling distributions with nonsmooth convex composite potentials, which is the sum of a continuously differentiable function and a possibly nonsmooth function. We devise such algorithms leveraging recent advances in convex analysis and optimization methods involving Bregman divergences, namely the Bregman\u2013Moreau envelopes and the Bregman proximity operators, and in the Langevin Monte Carlo algorithms reminiscent of mirror descent. The proposed algorithms extend existing Langevin Monte Carlo algorithms in two aspects\u2014the ability to sample nonsmooth distributions with mirror descent-like algorithms, and the use of the more general Bregman\u2013Moreau envelope in place of the Moreau envelope as a smooth approximation of the nonsmooth part of the potential. A particular case of the proposed scheme is reminiscent of the Bregman proximal gradient algorithm. 
The efficiency of the proposed methodology is illustrated with various sampling tasks at which existing Langevin Monte Carlo methods are known to perform poorly.}\n}", "pdf": "https://proceedings.mlr.press/v162/lau22a/lau22a.pdf", "supp": "", "pdf_size": 2164364, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5379410108101548560&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics and Data Science, Northwestern University, Evanston, IL, USA; Department of Computer Science, Northwestern University, Evanston, IL, USA", "aff_domain": "northwestern.edu; ", "email": "northwestern.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/lau22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Northwestern University", "aff_unique_dep": "Department of Statistics and Data Science", "aff_unique_url": "https://www.northwestern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Evanston", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Building Robust Ensembles via Margin Boosting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17005", "id": "17005", "proceeding": "https://proceedings.mlr.press/v162/zhang22aj.html", "poster": "/media/PosterPDFs/ICML%202022/dc5689792e08eb2e219dce49e64c885b.png?t=1657241491.2933865", "slides": "", "author_site": "Dinghuai Zhang, Hongyang Zhang, Aaron Courville, Yoshua Bengio, Pradeep Ravikumar, Arun Sai Suggala", "author": "Dinghuai Zhang; Hongyang Zhang; Aaron Courville; Yoshua Bengio; Pradeep Ravikumar; Arun Sai Suggala", "abstract": "In the context of adversarial robustness, a single model does not usually have enough power to defend against all possible adversarial attacks, and as a result, has sub-optimal robustness. Consequently, an emerging line of work has focused on learning an ensemble of neural networks to defend against adversarial attacks. In this work, we take a principled approach towards building robust ensembles. We view this problem from the perspective of margin-boosting and develop an algorithm for learning an ensemble with maximum margin. Through extensive empirical evaluation on benchmark datasets, we show that our algorithm not only outperforms existing ensembling techniques, but also large models trained in an end-to-end fashion. An important byproduct of our work is a margin-maximizing cross-entropy (MCE) loss, which is a better alternative to the standard cross-entropy (CE) loss. 
Empirically, we show that replacing the CE loss in state-of-the-art adversarial training techniques with our MCE loss leads to significant performance improvement.", "bibtex": "@InProceedings{pmlr-v162-zhang22aj,\n title = \t {Building Robust Ensembles via Margin Boosting},\n author = {Zhang, Dinghuai and Zhang, Hongyang and Courville, Aaron and Bengio, Yoshua and Ravikumar, Pradeep and Suggala, Arun Sai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26669--26692},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22aj/zhang22aj.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22aj.html},\n abstract = \t {In the context of adversarial robustness, a single model does not usually have enough power to defend against all possible adversarial attacks, and as a result, has sub-optimal robustness. Consequently, an emerging line of work has focused on learning an ensemble of neural networks to defend against adversarial attacks. In this work, we take a principled approach towards building robust ensembles. We view this problem from the perspective of margin-boosting and develop an algorithm for learning an ensemble with maximum margin. Through extensive empirical evaluation on benchmark datasets, we show that our algorithm not only outperforms existing ensembling techniques, but also large models trained in an end-to-end fashion. An important byproduct of our work is a margin-maximizing cross-entropy (MCE) loss, which is a better alternative to the standard cross-entropy (CE) loss. 
Empirically, we show that replacing the CE loss in state-of-the-art adversarial training techniques with our MCE loss leads to significant performance improvement.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22aj/zhang22aj.pdf", "supp": "", "pdf_size": 569098, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13608655782211931186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Mila and Universit\u00e9 de Montr\u00e9al; University of Waterloo; Mila and Universit\u00e9 de Montr\u00e9al; Mila and Universit\u00e9 de Montr\u00e9al; Carnegie Mellon University; Google Research", "aff_domain": "mila.quebec; ; ; ; ; ", "email": "mila.quebec; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22aj.html", "aff_unique_index": "0;1;0;0;2;3", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;University of Waterloo;Carnegie Mellon University;Google", "aff_unique_dep": ";;;Google Research", "aff_unique_url": "https://www.umontreal.ca;https://uwaterloo.ca;https://www.cmu.edu;https://research.google", "aff_unique_abbr": "UdeM;UW;CMU;Google Research", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Montreal;;Mountain View", "aff_country_unique_index": "0;0;0;0;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Burst-Dependent Plasticity and Dendritic Amplification Support Target-Based Learning and Hierarchical Imitation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16987", "id": "16987", "proceeding": "https://proceedings.mlr.press/v162/capone22b.html", "poster": "/media/PosterPDFs/ICML%202022/674bfc5f6b72706fb769f5e93667bd23.png?t=1657626528.4418044", "slides": "", "author_site": "Cristiano Capone, Cosimo Lupo, Paolo Muratore, Pier Stanislao Paolucci", "author": "Cristiano Capone; Cosimo Lupo; Paolo Muratore; Pier Stanislao Paolucci", "abstract": "The brain can learn to solve a wide range of tasks with high temporal and energetic efficiency. However, most biological models are composed of simple single-compartment neurons and cannot achieve the state-of-the-art performances of artificial intelligence. We propose a multi-compartment model of pyramidal neuron, in which bursts and dendritic input segregation give the possibility to plausibly support a biological target-based learning. In target-based learning, the internal solution of a problem (a spatio-temporal pattern of bursts in our case) is suggested to the network, bypassing the problems of error backpropagation and credit assignment. 
Finally, we show that this neuronal architecture naturally supports the orchestration of \u201chierarchical imitation learning\u201d, enabling the decomposition of challenging long-horizon decision-making tasks into simpler subtasks.", "bibtex": "@InProceedings{pmlr-v162-capone22b,\n title = \t {Burst-Dependent Plasticity and Dendritic Amplification Support Target-Based Learning and Hierarchical Imitation Learning},\n author = {Capone, Cristiano and Lupo, Cosimo and Muratore, Paolo and Paolucci, Pier Stanislao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2625--2637},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/capone22b/capone22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/capone22b.html},\n abstract = \t {The brain can learn to solve a wide range of tasks with high temporal and energetic efficiency. However, most biological models are composed of simple single-compartment neurons and cannot achieve the state-of-the-art performances of artificial intelligence. We propose a multi-compartment model of pyramidal neuron, in which bursts and dendritic input segregation give the possibility to plausibly support a biological target-based learning. In target-based learning, the internal solution of a problem (a spatio-temporal pattern of bursts in our case) is suggested to the network, bypassing the problems of error backpropagation and credit assignment. Finally, we show that this neuronal architecture naturally supports the orchestration of \u201chierarchical imitation learning\u201d, enabling the decomposition of challenging long-horizon decision-making tasks into simpler subtasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/capone22b/capone22b.pdf", "supp": "", "pdf_size": 3088941, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8004952254033817821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "INFN, Sezione di Roma, Rome, Italy; INFN, Sezione di Roma, Rome, Italy; SISSA, International School for Advanced Studies, Trieste, Italy; INFN, Sezione di Roma, Rome, Italy", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/capone22b.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "INFN;International School for Advanced Studies", "aff_unique_dep": "Sezione di Roma;", "aff_unique_url": "https://www.infn.it;https://www.sissa.it", "aff_unique_abbr": "INFN;SISSA", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Rome;Trieste", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "ButterflyFlow: Building Invertible Layers with Butterfly Matrices", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16919", "id": "16919", "proceeding": "https://proceedings.mlr.press/v162/meng22a.html", "poster": "/media/PosterPDFs/ICML%202022/761e6675f9e54673cc778e7fdb2823d2.png?t=1657662236.5284092", "slides": "", "author_site": "Chenlin Meng, Linqi Zhou, Kristy Choi, Tri Dao, Stefano Ermon", "author": "Chenlin Meng; Linqi Zhou; Kristy Choi; Tri Dao; Stefano Ermon", "abstract": "Normalizing flows model complex probability distributions using maps obtained 
by composing invertible layers. Special linear layers such as masked and 1{\\texttimes}1 convolutions play a key role in existing architectures because they increase expressive power while having tractable Jacobians and inverses. We propose a new family of invertible linear layers based on butterfly layers, which are known to theoretically capture complex linear structures including permutations and periodicity, yet can be inverted efficiently. This representational power is a key advantage of our approach, as such structures are common in many real-world datasets. Based on our invertible butterfly layers, we construct a new class of normalizing flow models called ButterflyFlow. Empirically, we demonstrate that ButterflyFlows not only achieve strong density estimation results on natural images such as MNIST, CIFAR-10, and ImageNet-32{\\texttimes}32, but also obtain significantly better log-likelihoods on structured datasets such as galaxy images and MIMIC-III patient cohorts{\u2014}all while being more efficient in terms of memory and computation than relevant baselines.", "bibtex": "@InProceedings{pmlr-v162-meng22a,\n title = \t {{B}utterfly{F}low: Building Invertible Layers with Butterfly Matrices},\n author = {Meng, Chenlin and Zhou, Linqi and Choi, Kristy and Dao, Tri and Ermon, Stefano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15360--15375},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/meng22a/meng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/meng22a.html},\n abstract = \t {Normalizing flows model complex probability distributions using maps obtained by composing invertible layers. Special linear layers such as masked and 1{\\texttimes}1 convolutions play a key role in existing architectures because they increase expressive power while having tractable Jacobians and inverses. We propose a new family of invertible linear layers based on butterfly layers, which are known to theoretically capture complex linear structures including permutations and periodicity, yet can be inverted efficiently. This representational power is a key advantage of our approach, as such structures are common in many real-world datasets. Based on our invertible butterfly layers, we construct a new class of normalizing flow models called ButterflyFlow. 
Empirically, we demonstrate that ButterflyFlows not only achieve strong density estimation results on natural images such as MNIST, CIFAR-10, and ImageNet-32{\\texttimes}32, but also obtain significantly better log-likelihoods on structured datasets such as galaxy images and MIMIC-III patient cohorts{\u2014}all while being more efficient in terms of memory and computation than relevant baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/meng22a/meng22a.pdf", "supp": "", "pdf_size": 1559619, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1946519107203233584&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Computer Science Department, Stanford University; Computer Science Department, Stanford University; Computer Science Department, Stanford University; Computer Science Department, Stanford University; Computer Science Department, Stanford University", "aff_domain": "cs.stanford.edu;stanford.edu;cs.stanford.edu; ;cs.stanford.edu", "email": "cs.stanford.edu;stanford.edu;cs.stanford.edu; ;cs.stanford.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/meng22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Byzantine Machine Learning Made Easy By Resilient Averaging of Momentums", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17605", "id": "17605", "proceeding": "https://proceedings.mlr.press/v162/farhadkhani22a.html", "poster": "/media/PosterPDFs/ICML%202022/0a1bf96b7165e962e90cb14648c9462d_KdGkEfx.png?t=1657889260.6203449", "slides": "", "author_site": "Sadegh Farhadkhani, Rachid Guerraoui, Nirupam Gupta, Rafael Pinot, John Stephan", "author": "Sadegh Farhadkhani; Rachid Guerraoui; Nirupam Gupta; Rafael Pinot; John Stephan", "abstract": "Byzantine resilience emerged as a prominent topic within the distributed machine learning community. Essentially, the goal is to enhance distributed optimization algorithms, such as distributed SGD, in a way that guarantees convergence despite the presence of some misbehaving (a.k.a.,", "bibtex": "@InProceedings{pmlr-v162-farhadkhani22a,\n title = \t {{B}yzantine Machine Learning Made Easy By Resilient Averaging of Momentums},\n author = {Farhadkhani, Sadegh and Guerraoui, Rachid and Gupta, Nirupam and Pinot, Rafael and Stephan, John},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6246--6283},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/farhadkhani22a/farhadkhani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/farhadkhani22a.html},\n abstract = \t {Byzantine resilience emerged as a prominent topic within the distributed machine learning community. 
Essentially, the goal is to enhance distributed optimization algorithms, such as distributed SGD, in a way that guarantees convergence despite the presence of some misbehaving (a.k.a.,", "pdf": "https://proceedings.mlr.press/v162/farhadkhani22a/farhadkhani22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/farhadkhani22a-supp.zip", "pdf_size": 2950582, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14837588904512059895&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Distributed Computing Laboratory (DCL), School of Computer and Communication Sciences, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Distributed Computing Laboratory (DCL), School of Computer and Communication Sciences, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Distributed Computing Laboratory (DCL), School of Computer and Communication Sciences, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Distributed Computing Laboratory (DCL), School of Computer and Communication Sciences, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland; Distributed Computing Laboratory (DCL), School of Computer and Communication Sciences, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne (EPFL), Switzerland", "aff_domain": "epfl.ch;epfl.ch; ; ; ", "email": "epfl.ch;epfl.ch; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/farhadkhani22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "School of Computer and Communication Sciences", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "C*-algebra Net: A New Approach Generalizing Neural Network Parameters to C*-algebra", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16083", "id": "16083", "proceeding": "https://proceedings.mlr.press/v162/hashimoto22a.html", "poster": "/media/PosterPDFs/ICML%202022/aa2a77371374094fe9e0bc1de3f94ed9.png?t=1657429108.289362", "slides": "", "author_site": "Yuka Hashimoto, Zhao Wang, Tomoko Matsui", "author": "Yuka Hashimoto; Zhao Wang; Tomoko Matsui", "abstract": "We propose a new framework that generalizes the parameters of neural network models to $C^*$-algebra-valued ones. $C^*$-algebra is a generalization of the space of complex numbers. A typical example is the space of continuous functions on a compact space. This generalization enables us to combine multiple models continuously and use tools for functions such as regression and integration. Consequently, we can learn features of data efficiently and adapt the models to problems continuously. We apply our framework to practical problems such as density estimation and few-shot learning and show that our framework enables us to learn features of data even with a limited number of samples. 
Our new framework highlights the potential possibility of applying the theory of $C^*$-algebra to general neural network models.", "bibtex": "@InProceedings{pmlr-v162-hashimoto22a,\n title = \t {C*-algebra Net: A New Approach Generalizing Neural Network Parameters to C*-algebra},\n author = {Hashimoto, Yuka and Wang, Zhao and Matsui, Tomoko},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8523--8534},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hashimoto22a/hashimoto22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hashimoto22a.html},\n abstract = \t {We propose a new framework that generalizes the parameters of neural network models to $C^*$-algebra-valued ones. $C^*$-algebra is a generalization of the space of complex numbers. A typical example is the space of continuous functions on a compact space. This generalization enables us to combine multiple models continuously and use tools for functions such as regression and integration. Consequently, we can learn features of data efficiently and adapt the models to problems continuously. We apply our framework to practical problems such as density estimation and few-shot learning and show that our framework enables us to learn features of data even with a limited number of samples. Our new framework highlights the potential possibility of applying the theory of $C^*$-algebra to general neural network models.}\n}", "pdf": "https://proceedings.mlr.press/v162/hashimoto22a/hashimoto22a.pdf", "supp": "", "pdf_size": 1909152, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13619500713183243030&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "NTT Network Service Systems Laboratories, NTT Corporation, Tokyo, Japan+Institute for Disaster Response Robotics, Waseda University, Tokyo, Japan; NTT Network Service Systems Laboratories, NTT Corporation, Tokyo, Japan+Institute for Disaster Response Robotics, Waseda University, Tokyo, Japan; Department of Statistical Modeling, the Institute of Statistical Mathematics, Tokyo, Japan", "aff_domain": "hco.ntt.co.jp; ; ", "email": "hco.ntt.co.jp; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hashimoto22a.html", "aff_unique_index": "0+1;0+1;2", "aff_unique_norm": "NTT Corporation;Waseda University;Institute of Statistical Mathematics", "aff_unique_dep": "Network Service Systems Laboratories;Institute for Disaster Response Robotics;Department of Statistical Modeling", "aff_unique_url": "https://www.ntt.co.jp;https://www.waseda.jp/top;https://www.ism.ac.jp", "aff_unique_abbr": "NTT;Waseda;ISM", "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "Japan" }, { "title": "C-MinHash: Improving Minwise Hashing with Circulant Permutation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17815", "id": "17815", "proceeding": "https://proceedings.mlr.press/v162/li22m.html", "poster": "", "slides": "", "author_site": "Xiaoyun Li, Ping Li", "author": "Xiaoyun Li; Ping Li", "abstract": "Minwise hashing (MinHash) is an important and practical algorithm for generating random hashes to 
approximate the Jaccard (resemblance) similarity in massive binary (0/1) data. The basic theory of MinHash requires applying hundreds or even thousands of independent random permutations to each data vector in the dataset, in order to obtain reliable results for (e.g.,) building large-scale learning models or approximate near neighbor search. In this paper, we propose Circulant MinHash (C-MinHash) and provide the surprising theoretical results that using only two independent random permutations in a circulant manner leads to uniformly smaller Jaccard estimation variance than that of the classical MinHash with K independent permutations. Experiments are conducted to show the effectiveness of the proposed method. We also propose a more convenient C-MinHash variant which reduces two permutations to just one, with extensive numerical results to validate that it achieves essentially the same estimation accuracy as using two permutations.", "bibtex": "@InProceedings{pmlr-v162-li22m,\n title = \t {C-{M}in{H}ash: Improving Minwise Hashing with Circulant Permutation},\n author = {Li, Xiaoyun and Li, Ping},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12857--12887},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22m/li22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22m.html},\n abstract = \t {Minwise hashing (MinHash) is an important and practical algorithm for generating random hashes to approximate the Jaccard (resemblance) similarity in massive binary (0/1) data. The basic theory of MinHash requires applying hundreds or even thousands of independent random permutations to each data vector in the dataset, in order to obtain reliable results for (e.g.,) building large-scale learning models or approximate near neighbor search. In this paper, we propose Circulant MinHash (C-MinHash) and provide the surprising theoretical results that using only two independent random permutations in a circulant manner leads to uniformly smaller Jaccard estimation variance than that of the classical MinHash with K independent permutations. Experiments are conducted to show the effectiveness of the proposed method. 
We also propose a more convenient C-MinHash variant which reduces two permutations to just one, with extensive numerical results to validate that it achieves essentially the same estimation accuracy as using two permutations.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22m/li22m.pdf", "supp": "", "pdf_size": 7947475, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7553255132887924050&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Cognitive Computing Lab, Baidu Research; Cognitive Computing Lab, Baidu Research", "aff_domain": "gmail.com;gmail.com", "email": "gmail.com;gmail.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22m.html", "aff_unique_index": "0;0", "aff_unique_norm": "Baidu", "aff_unique_dep": "Cognitive Computing Lab", "aff_unique_url": "https://baidu.com", "aff_unique_abbr": "Baidu", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "CITRIS: Causal Identifiability from Temporal Intervened Sequences", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17425", "id": "17425", "proceeding": "https://proceedings.mlr.press/v162/lippe22a.html", "poster": "/media/PosterPDFs/ICML%202022/43cca4b3de2097b9558efefd0ecc3588_USh52N9.png?t=1657548651.1359744", "slides": "", "author_site": "Phillip Lippe, Sara Magliacane, Sindy L\u00f6we, Yuki Asano, Taco Cohen, Stratis Gavves", "author": "Phillip Lippe; Sara Magliacane; Sindy L\u00f6we; Yuki M Asano; Taco Cohen; Stratis Gavves", "abstract": "Understanding the latent causal factors of a dynamical system from visual observations is considered a crucial step towards agents reasoning in complex environments. In this paper, we propose CITRIS, a variational autoencoder framework that learns causal representations from temporal sequences of images in which underlying causal factors have possibly been intervened upon. In contrast to the recent literature, CITRIS exploits temporality and observing intervention targets to identify scalar and multidimensional causal factors, such as 3D rotation angles. Furthermore, by introducing a normalizing flow, CITRIS can be easily extended to leverage and disentangle representations obtained by already pretrained autoencoders. Extending previous results on scalar causal factors, we prove identifiability in a more general setting, in which only some components of a causal factor are affected by interventions. In experiments on 3D rendered image sequences, CITRIS outperforms previous methods on recovering the underlying causal variables. 
Moreover, using pretrained autoencoders, CITRIS can even generalize to unseen instantiations of causal factors, opening future research areas in sim-to-real generalization for causal representation learning.", "bibtex": "@InProceedings{pmlr-v162-lippe22a,\n title = \t {{CITRIS}: Causal Identifiability from Temporal Intervened Sequences},\n author = {Lippe, Phillip and Magliacane, Sara and L{\\\"o}we, Sindy and Asano, Yuki M and Cohen, Taco and Gavves, Stratis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13557--13603},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lippe22a/lippe22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lippe22a.html},\n abstract = \t {Understanding the latent causal factors of a dynamical system from visual observations is considered a crucial step towards agents reasoning in complex environments. In this paper, we propose CITRIS, a variational autoencoder framework that learns causal representations from temporal sequences of images in which underlying causal factors have possibly been intervened upon. In contrast to the recent literature, CITRIS exploits temporality and observing intervention targets to identify scalar and multidimensional causal factors, such as 3D rotation angles. Furthermore, by introducing a normalizing flow, CITRIS can be easily extended to leverage and disentangle representations obtained by already pretrained autoencoders. Extending previous results on scalar causal factors, we prove identifiability in a more general setting, in which only some components of a causal factor are affected by interventions. In experiments on 3D rendered image sequences, CITRIS outperforms previous methods on recovering the underlying causal variables. Moreover, using pretrained autoencoders, CITRIS can even generalize to unseen instantiations of causal factors, opening future research areas in sim-to-real generalization for causal representation learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/lippe22a/lippe22a.pdf", "supp": "", "pdf_size": 3830308, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9740161650140858183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/lippe22a.html" }, { "title": "COAT: Measuring Object Compositionality in Emergent Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16677", "id": "16677", "proceeding": "https://proceedings.mlr.press/v162/xie22b.html", "poster": "/media/PosterPDFs/ICML%202022/043c2ec6c6390dd0ac5519190a57c88c_mvNGqGB.png?t=1657666758.9266505", "slides": "", "author_site": "Sirui Xie, Ari Morcos, Song-Chun Zhu, Shanmukha Ramakrishna Vedantam", "author": "Sirui Xie; Ari S Morcos; Song-Chun Zhu; Ramakrishna Vedantam", "abstract": "Learning representations that can decompose a multi-object scene into its constituent objects and recompose them flexibly is desirable for object-oriented reasoning and planning. 
Built upon object masks in the pixel space, existing metrics for objectness can only evaluate generative models with an object-specific \u201cslot\u201d structure. We propose to directly measure compositionality in the representation space as a form of objectness, making such evaluations tractable for a wider class of models. Our metric, COAT (Compositional Object Algebra Test), evaluates if a generic representation exhibits certain geometric properties that underpin object compositionality beyond what is already captured by the raw pixel space. Our experiments on the popular CLEVR (Johnson et al., 2018) domain reveal that existing disentanglement-based generative models are not as compositional as one might expect, suggesting room for further modeling improvements. We hope our work allows for a unified evaluation of object-centric representations, spanning generative as well as discriminative, self-supervised models.", "bibtex": "@InProceedings{pmlr-v162-xie22b,\n title = \t {{COAT}: Measuring Object Compositionality in Emergent Representations},\n author = {Xie, Sirui and Morcos, Ari S and Zhu, Song-Chun and Vedantam, Ramakrishna},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24388--24413},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xie22b/xie22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/xie22b.html},\n abstract = \t {Learning representations that can decompose a multi-object scene into its constituent objects and recompose them flexibly is desirable for object-oriented reasoning and planning. Built upon object masks in the pixel space, existing metrics for objectness can only evaluate generative models with an object-specific \u201cslot\u201d structure. We propose to directly measure compositionality in the representation space as a form of objectness, making such evaluations tractable for a wider class of models. Our metric, COAT (Compositional Object Algebra Test), evaluates if a generic representation exhibits certain geometric properties that underpin object compositionality beyond what is already captured by the raw pixel space. Our experiments on the popular CLEVR (Johnson et al., 2018) domain reveal that existing disentanglement-based generative models are not as compositional as one might expect, suggesting room for further modeling improvements. 
We hope our work allows for a unified evaluation of object-centric representations, spanning generative as well as discriminative, self-supervised models.}\n}", "pdf": "https://proceedings.mlr.press/v162/xie22b/xie22b.pdf", "supp": "", "pdf_size": 24428970, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12983702012681036190&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Department of Computer Science, UCLA + Fundamental AI Research (FAIR, Meta Inc.); Fundamental AI Research (FAIR, Meta Inc.); Department of Computer Science, UCLA + Department of Statistics, UCLA; Fundamental AI Research (FAIR, Meta Inc.)", "aff_domain": "ucla.edu; ; ; ", "email": "ucla.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/xie22b.html", "aff_unique_index": "0+1;1;0+0;1", "aff_unique_norm": "University of California, Los Angeles;Meta", "aff_unique_dep": "Department of Computer Science;Fundamental AI Research", "aff_unique_url": "https://www.ucla.edu;https://meta.com", "aff_unique_abbr": "UCLA;Meta", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0+0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "COLA: Consistent Learning with Opponent-Learning Awareness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16411", "id": "16411", "proceeding": "https://proceedings.mlr.press/v162/willi22a.html", "poster": "", "slides": "", "author_site": "Timon Willi, Alistair Letcher, Johannes Treutlein, Jakob Foerster", "author": "Timon Willi; Alistair Hp Letcher; Johannes Treutlein; Jakob Foerster", "abstract": "Learning in general-sum games is unstable and frequently leads to socially undesirable (Pareto-dominated) outcomes. To mitigate this, Learning with Opponent-Learning Awareness (LOLA) introduced opponent shaping to this setting, by accounting for each agent\u2019s influence on their opponents\u2019 anticipated learning steps. However, the original LOLA formulation (and follow-up work) is inconsistent because LOLA models other agents as naive learners rather than LOLA agents. In previous work, this inconsistency was suggested as a cause of LOLA\u2019s failure to preserve stable fixed points (SFPs). First, we formalize consistency and show that higher-order LOLA (HOLA) solves LOLA\u2019s inconsistency problem if it converges. Second, we correct a claim made in the literature by Sch{\u00e4}fer and Anandkumar (2019), proving that Competitive Gradient Descent (CGD) does not recover HOLA as a series expansion (and fails to solve the consistency problem). Third, we propose a new method called Consistent LOLA (COLA), which learns update functions that are consistent under mutual opponent shaping. It requires no more than second-order derivatives and learns consistent update functions even when HOLA fails to converge. However, we also prove that even consistent update functions do not preserve SFPs, contradicting the hypothesis that this shortcoming is caused by LOLA\u2019s inconsistency. Finally, in an empirical evaluation on a set of general-sum games, we find that COLA finds prosocial solutions and that it converges under a wider range of learning rates than HOLA and LOLA. 
We support the latter finding with a theoretical result for a simple game.", "bibtex": "@InProceedings{pmlr-v162-willi22a,\n title = \t {{COLA}: Consistent Learning with Opponent-Learning Awareness},\n author = {Willi, Timon and Letcher, Alistair Hp and Treutlein, Johannes and Foerster, Jakob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23804--23831},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/willi22a/willi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/willi22a.html},\n abstract = \t {Learning in general-sum games is unstable and frequently leads to socially undesirable (Pareto-dominated) outcomes. To mitigate this, Learning with Opponent-Learning Awareness (LOLA) introduced opponent shaping to this setting, by accounting for each agent\u2019s influence on their opponents\u2019 anticipated learning steps. However, the original LOLA formulation (and follow-up work) is inconsistent because LOLA models other agents as naive learners rather than LOLA agents. In previous work, this inconsistency was suggested as a cause of LOLA\u2019s failure to preserve stable fixed points (SFPs). First, we formalize consistency and show that higher-order LOLA (HOLA) solves LOLA\u2019s inconsistency problem if it converges. Second, we correct a claim made in the literature by Sch{\u00e4}fer and Anandkumar (2019), proving that Competitive Gradient Descent (CGD) does not recover HOLA as a series expansion (and fails to solve the consistency problem). Third, we propose a new method called Consistent LOLA (COLA), which learns update functions that are consistent under mutual opponent shaping. It requires no more than second-order derivatives and learns consistent update functions even when HOLA fails to converge. However, we also prove that even consistent update functions do not preserve SFPs, contradicting the hypothesis that this shortcoming is caused by LOLA\u2019s inconsistency. Finally, in an empirical evaluation on a set of general-sum games, we find that COLA finds prosocial solutions and that it converges under a wider range of learning rates than HOLA and LOLA. 
We support the latter finding with a theoretical result for a simple game.}\n}", "pdf": "https://proceedings.mlr.press/v162/willi22a/willi22a.pdf", "supp": "", "pdf_size": 4403056, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14450342073245803366&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Engineering Science, University of Oxford, United Kingdom+Vector Institute, Toronto, Canada; Department of Engineering Science, University of Oxford, United Kingdom; Department of Computer Science, University of Toronto, Canada+Vector Institute, Toronto, Canada; Department of Engineering Science, University of Oxford, United Kingdom", "aff_domain": "eng.ox.ac.uk; ; ; ", "email": "eng.ox.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/willi22a.html", "aff_unique_index": "0+1;0;2+1;0", "aff_unique_norm": "University of Oxford;Vector Institute;University of Toronto", "aff_unique_dep": "Department of Engineering Science;;Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk;https://vectorinstitute.ai;https://www.utoronto.ca", "aff_unique_abbr": "Oxford;Vector Institute;U of T", "aff_campus_unique_index": "0+1;0;1;0", "aff_campus_unique": "Oxford;Toronto;", "aff_country_unique_index": "0+1;0;1+1;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "Calibrated Learning to Defer with One-vs-All Classifiers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18123", "id": "18123", "proceeding": "https://proceedings.mlr.press/v162/verma22c.html", "poster": "/media/PosterPDFs/ICML%202022/024677efb8e4aee2eaeef17b54695bbe.png?t=1658072623.4170904", "slides": "", "author_site": "Rajeev Verma, Eric Nalisnick", "author": "Rajeev Verma; Eric Nalisnick", "abstract": "The learning to defer (L2D) framework has the potential to make AI systems safer. For a given input, the system can defer the decision to a human if the human is more likely than the model to take the correct action. We study the calibration of L2D systems, investigating if the probabilities they output are sound. We find that Mozannar & Sontag\u2019s (2020) multiclass framework is not calibrated with respect to expert correctness. Moreover, it is not even guaranteed to produce valid probabilities due to its parameterization being degenerate for this purpose. We propose an L2D system based on one-vs-all classifiers that is able to produce calibrated probabilities of expert correctness. Furthermore, our loss function is also a consistent surrogate for multiclass L2D, like Mozannar & Sontag\u2019s (2020). Our experiments verify that not only is our system calibrated, but this benefit comes at no cost to accuracy. 
Our model\u2019s accuracy is always comparable (and often superior) to Mozannar & Sontag\u2019s (2020) model\u2019s in tasks ranging from hate speech detection to galaxy classification to diagnosis of skin lesions.", "bibtex": "@InProceedings{pmlr-v162-verma22c,\n title = \t {Calibrated Learning to Defer with One-vs-All Classifiers},\n author = {Verma, Rajeev and Nalisnick, Eric},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22184--22202},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/verma22c/verma22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/verma22c.html},\n abstract = \t {The learning to defer (L2D) framework has the potential to make AI systems safer. For a given input, the system can defer the decision to a human if the human is more likely than the model to take the correct action. We study the calibration of L2D systems, investigating if the probabilities they output are sound. We find that Mozannar & Sontag\u2019s (2020) multiclass framework is not calibrated with respect to expert correctness. Moreover, it is not even guaranteed to produce valid probabilities due to its parameterization being degenerate for this purpose. We propose an L2D system based on one-vs-all classifiers that is able to produce calibrated probabilities of expert correctness. Furthermore, our loss function is also a consistent surrogate for multiclass L2D, like Mozannar & Sontag\u2019s (2020). Our experiments verify that not only is our system calibrated, but this benefit comes at no cost to accuracy. 
Our model\u2019s accuracy is always comparable (and often superior) to Mozannar & Sontag\u2019s (2020) model\u2019s in tasks ranging from hate speech detection to galaxy classification to diagnosis of skin lesions.}\n}", "pdf": "https://proceedings.mlr.press/v162/verma22c/verma22c.pdf", "supp": "", "pdf_size": 625905, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8829480964232923072&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Informatics Institute, University of Amsterdam, Amsterdam, Netherlands; Informatics Institute, University of Amsterdam, Amsterdam, Netherlands", "aff_domain": "gmail.com;uva.nl", "email": "gmail.com;uva.nl", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/verma22c.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "Informatics Institute", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amsterdam", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Calibrated and Sharp Uncertainties in Deep Learning via Density Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17391", "id": "17391", "proceeding": "https://proceedings.mlr.press/v162/kuleshov22a.html", "poster": "/media/PosterPDFs/ICML%202022/3e91970f771a2c473ae36b60d1146068.png?t=1658353187.6082215", "slides": "", "author_site": "Volodymyr Kuleshov, Shachi Deshpande", "author": "Volodymyr Kuleshov; Shachi Deshpande", "abstract": "Accurate probabilistic predictions can be characterized by two properties{\u2014}calibration and sharpness. However, standard maximum likelihood training yields models that are poorly calibrated and thus inaccurate{\u2014}a 90% confidence interval typically does not contain the true outcome 90% of the time. This paper argues that calibration is important in practice and is easy to maintain by performing low-dimensional density estimation. We introduce a simple training procedure based on recalibration that yields calibrated models without sacrificing overall performance; unlike previous approaches, ours ensures the most general property of distribution calibration and applies to any model, including neural networks. We formally prove the correctness of our procedure assuming that we can estimate densities in low dimensions and we establish uniform convergence bounds. Our results yield empirical performance improvements on linear and deep Bayesian models and suggest that calibration should be increasingly leveraged across machine learning.", "bibtex": "@InProceedings{pmlr-v162-kuleshov22a,\n title = \t {Calibrated and Sharp Uncertainties in Deep Learning via Density Estimation},\n author = {Kuleshov, Volodymyr and Deshpande, Shachi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11683--11693},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kuleshov22a/kuleshov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kuleshov22a.html},\n abstract = \t {Accurate probabilistic predictions can be characterized by two properties{\u2014}calibration and sharpness. 
However, standard maximum likelihood training yields models that are poorly calibrated and thus inaccurate{\u2014}a 90% confidence interval typically does not contain the true outcome 90% of the time. This paper argues that calibration is important in practice and is easy to maintain by performing low-dimensional density estimation. We introduce a simple training procedure based on recalibration that yields calibrated models without sacrificing overall performance; unlike previous approaches, ours ensures the most general property of distribution calibration and applies to any model, including neural networks. We formally prove the correctness of our procedure assuming that we can estimate densities in low dimensions and we establish uniform convergence bounds. Our results yield empirical performance improvements on linear and deep Bayesian models and suggest that calibration should be increasingly leveraged across machine learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/kuleshov22a/kuleshov22a.pdf", "supp": "", "pdf_size": 1385107, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16568353107689337877&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Cornell Tech and Cornell University, New York, NY; Department of Computer Science, Cornell Tech and Cornell University, New York, NY", "aff_domain": "cornell.edu;cornell.edu", "email": "cornell.edu;cornell.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kuleshov22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Cascaded Gaps: Towards Logarithmic Regret for Risk-Sensitive Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16487", "id": "16487", "proceeding": "https://proceedings.mlr.press/v162/fei22b.html", "poster": "", "slides": "", "author_site": "Yingjie Fei, Ruitu Xu", "author": "Yingjie Fei; Ruitu Xu", "abstract": "In this paper, we study gap-dependent regret guarantees for risk-sensitive reinforcement learning based on the entropic risk measure. We propose a novel definition of sub-optimality gaps, which we call cascaded gaps, and we discuss their key components that adapt to underlying structures of the problem. Based on the cascaded gaps, we derive non-asymptotic and logarithmic regret bounds for two model-free algorithms under episodic Markov decision processes. We show that, in appropriate settings, these bounds feature exponential improvement over existing ones that are independent of gaps. 
We also prove gap-dependent lower bounds, which certify the near optimality of the upper bounds.", "bibtex": "@InProceedings{pmlr-v162-fei22b,\n title = \t {Cascaded Gaps: Towards Logarithmic Regret for Risk-Sensitive Reinforcement Learning},\n author = {Fei, Yingjie and Xu, Ruitu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6392--6417},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fei22b/fei22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/fei22b.html},\n abstract = \t {In this paper, we study gap-dependent regret guarantees for risk-sensitive reinforcement learning based on the entropic risk measure. We propose a novel definition of sub-optimality gaps, which we call cascaded gaps, and we discuss their key components that adapt to underlying structures of the problem. Based on the cascaded gaps, we derive non-asymptotic and logarithmic regret bounds for two model-free algorithms under episodic Markov decision processes. We show that, in appropriate settings, these bounds feature exponential improvement over existing ones that are independent of gaps. We also prove gap-dependent lower bounds, which certify the near optimality of the upper bounds.}\n}", "pdf": "https://proceedings.mlr.press/v162/fei22b/fei22b.pdf", "supp": "", "pdf_size": 429042, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3607028092851969068&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Bloomberg, New York, USA; Department of Statistics and Data Science, Yale University, USA", "aff_domain": "cornell.edu; ", "email": "cornell.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/fei22b.html", "aff_unique_index": "0;1", "aff_unique_norm": "Bloomberg;Yale University", "aff_unique_dep": ";Department of Statistics and Data Science", "aff_unique_url": "https://www.bloomberg.com;https://www.yale.edu", "aff_unique_abbr": "Bloomberg;Yale", "aff_campus_unique_index": "0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Causal Conceptions of Fairness and their Consequences", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17121", "id": "17121", "proceeding": "https://proceedings.mlr.press/v162/nilforoshan22a.html", "poster": "", "slides": "", "author_site": "Hamed Nilforoshan, Johann Gaebler, Ravi Shroff, Sharad Goel", "author": "Hamed Nilforoshan; Johann D Gaebler; Ravi Shroff; Sharad Goel", "abstract": "Recent work highlights the role of causality in designing equitable decision-making algorithms. It is not immediately clear, however, how existing causal conceptions of fairness relate to one another, or what the consequences are of using these definitions as design principles. Here, we first assemble and categorize popular causal definitions of algorithmic fairness into two broad families: (1) those that constrain the effects of decisions on counterfactual disparities; and (2) those that constrain the effects of legally protected characteristics, like race and gender, on decisions. 
We then show, analytically and empirically, that both families of definitions", "bibtex": "@InProceedings{pmlr-v162-nilforoshan22a,\n title = \t {Causal Conceptions of Fairness and their Consequences},\n author = {Nilforoshan, Hamed and Gaebler, Johann D and Shroff, Ravi and Goel, Sharad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16848--16887},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nilforoshan22a/nilforoshan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nilforoshan22a.html},\n abstract = \t {Recent work highlights the role of causality in designing equitable decision-making algorithms. It is not immediately clear, however, how existing causal conceptions of fairness relate to one another, or what the consequences are of using these definitions as design principles. Here, we first assemble and categorize popular causal definitions of algorithmic fairness into two broad families: (1) those that constrain the effects of decisions on counterfactual disparities; and (2) those that constrain the effects of legally protected characteristics, like race and gender, on decisions. We then show, analytically and empirically, that both families of definitions", "pdf": "https://proceedings.mlr.press/v162/nilforoshan22a/nilforoshan22a.pdf", "supp": "", "pdf_size": 765560, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1541736830305704043&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Stanford University; Stanford University; New York University; Harvard University", "aff_domain": "cs.stanford.edu;stanford.edu;nyu.edu;hks.harvard.edu", "email": "cs.stanford.edu;stanford.edu;nyu.edu;hks.harvard.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nilforoshan22a.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Stanford University;New York University;Harvard University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.stanford.edu;https://www.nyu.edu;https://www.harvard.edu", "aff_unique_abbr": "Stanford;NYU;Harvard", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Dynamics Learning for Task-Independent State Abstraction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16839", "id": "16839", "proceeding": "https://proceedings.mlr.press/v162/wang22ae.html", "poster": "/media/PosterPDFs/ICML%202022/f19ec2b84181033bf4753a5a51d5d608.png?t=1657650726.8958068", "slides": "", "author_site": "Zizhao Wang, Xuesu Xiao, Zifan Xu, Yuke Zhu, Peter Stone", "author": "Zizhao Wang; Xuesu Xiao; Zifan Xu; Yuke Zhu; Peter Stone", "abstract": "Learning dynamics models accurately is an important goal for Model-Based Reinforcement Learning (MBRL), but most MBRL methods learn a dense dynamics model which is vulnerable to spurious correlations and therefore generalizes poorly to unseen states. 
In this paper, we introduce Causal Dynamics Learning for Task-Independent State Abstraction (CDL), which first learns a theoretically proved causal dynamics model that removes unnecessary dependencies between state variables and the action, thus generalizing well to unseen states. A state abstraction can then be derived from the learned dynamics, which not only improves sample efficiency but also applies to a wider range of tasks than existing state abstraction methods. Evaluated on two simulated environments and downstream tasks, both the dynamics model and policies learned by the proposed method generalize well to unseen states and the derived state abstraction improves sample efficiency compared to learning without it.", "bibtex": "@InProceedings{pmlr-v162-wang22ae,\n title = \t {Causal Dynamics Learning for Task-Independent State Abstraction},\n author = {Wang, Zizhao and Xiao, Xuesu and Xu, Zifan and Zhu, Yuke and Stone, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23151--23180},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ae/wang22ae.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ae.html},\n abstract = \t {Learning dynamics models accurately is an important goal for Model-Based Reinforcement Learning (MBRL), but most MBRL methods learn a dense dynamics model which is vulnerable to spurious correlations and therefore generalizes poorly to unseen states. In this paper, we introduce Causal Dynamics Learning for Task-Independent State Abstraction (CDL), which first learns a theoretically proved causal dynamics model that removes unnecessary dependencies between state variables and the action, thus generalizing well to unseen states. A state abstraction can then be derived from the learned dynamics, which not only improves sample efficiency but also applies to a wider range of tasks than existing state abstraction methods. 
Evaluated on two simulated environments and downstream tasks, both the dynamics model and policies learned by the proposed method generalize well to unseen states and the derived state abstraction improves sample efficiency compared to learning without it.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ae/wang22ae.pdf", "supp": "", "pdf_size": 24138531, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7092132108841275612&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "Department of Electrical and Computer Engineering; Department of Computer Science, The University of Texas at Austin; Department of Computer Science, The University of Texas at Austin; Department of Computer Science, The University of Texas at Austin; Department of Computer Science, The University of Texas at Austin + Sony AI", "aff_domain": "utexas.edu; ; ; ; ", "email": "utexas.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wang22ae.html", "aff_unique_index": "0;1;1;1;1+2", "aff_unique_norm": "Unknown Institution;University of Texas at Austin;Sony", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Computer Science;Sony AI", "aff_unique_url": ";https://www.utexas.edu;https://www.sony.com", "aff_unique_abbr": ";UT Austin;Sony AI", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "1;1;1;1+2", "aff_country_unique": ";United States;Japan" }, { "title": "Causal Imitation Learning under Temporally Correlated Noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18069", "id": "18069", "proceeding": "https://proceedings.mlr.press/v162/swamy22a.html", "poster": "/media/PosterPDFs/ICML%202022/f0b1d5879866f2c2eba77f39993d1184.png?t=1657662620.3236117", "slides": "/media/icml-2022/Slides/18069.pdf", "author_site": "Gokul Swamy, Sanjiban Choudhury, James Bagnell, Steven Wu", "author": "Gokul Swamy; Sanjiban Choudhury; Drew Bagnell; Steven Wu", "abstract": "We develop algorithms for imitation learning from policy data that was corrupted by temporally correlated noise in expert actions. When noise affects multiple timesteps of recorded data, it can manifest as spurious correlations between states and actions that a learner might latch on to, leading to poor policy performance. To break up these spurious correlations, we apply modern variants of the instrumental variable regression (IVR) technique of econometrics, enabling us to recover the underlying policy without requiring access to an interactive expert. In particular, we present two techniques, one of a generative-modeling flavor (DoubIL) that can utilize access to a simulator, and one of a game-theoretic flavor (ResiduIL) that can be run entirely offline. 
We find both of our algorithms compare favorably to behavioral cloning on simulated control tasks.", "bibtex": "@InProceedings{pmlr-v162-swamy22a,\n title = \t {Causal Imitation Learning under Temporally Correlated Noise},\n author = {Swamy, Gokul and Choudhury, Sanjiban and Bagnell, Drew and Wu, Steven},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20877--20890},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/swamy22a/swamy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/swamy22a.html},\n abstract = \t {We develop algorithms for imitation learning from policy data that was corrupted by temporally correlated noise in expert actions. When noise affects multiple timesteps of recorded data, it can manifest as spurious correlations between states and actions that a learner might latch on to, leading to poor policy performance. To break up these spurious correlations, we apply modern variants of the instrumental variable regression (IVR) technique of econometrics, enabling us to recover the underlying policy without requiring access to an interactive expert. In particular, we present two techniques, one of a generative-modeling flavor (DoubIL) that can utilize access to a simulator, and one of a game-theoretic flavor (ResiduIL) that can be run entirely offline. We find both of our algorithms compare favorably to behavioral cloning on simulated control tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/swamy22a/swamy22a.pdf", "supp": "", "pdf_size": 700409, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3778588231646817630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Carnegie Mellon University; Cornell University; Aurora Innovation; Carnegie Mellon University", "aff_domain": "cmu.edu; ; ; ", "email": "cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/swamy22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;Cornell University;Aurora Innovation", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.cornell.edu;https://aurora.tech", "aff_unique_abbr": "CMU;Cornell;Aurora", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Inference Through the Structural Causal Marginal Problem", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17833", "id": "17833", "proceeding": "https://proceedings.mlr.press/v162/gresele22a.html", "poster": "/media/PosterPDFs/ICML%202022/36452e720502e4da486d2f9f6b48a7bb.png?t=1657802227.4901776", "slides": "", "author_site": "Luigi Gresele, Julius von K\u00fcgelgen, Jonas K\u00fcbler, Elke Kirschbaum, Bernhard Sch\u00f6lkopf, Dominik Janzing", "author": "Luigi Gresele; Julius Von K\u00fcgelgen; Jonas K\u00fcbler; Elke Kirschbaum; Bernhard Sch\u00f6lkopf; Dominik Janzing", "abstract": "We introduce an approach to counterfactual inference based on merging information from multiple datasets. 
We consider a causal reformulation of the statistical marginal problem: given a collection of marginal structural causal models (SCMs) over distinct but overlapping sets of variables, determine the set of joint SCMs that are counterfactually consistent with the marginal ones. We formalise this approach for categorical SCMs using the response function formulation and show that it reduces the space of allowed marginal and joint SCMs. Our work thus highlights a new mode of falsifiability through additional variables, in contrast to the statistical one via additional data.", "bibtex": "@InProceedings{pmlr-v162-gresele22a,\n title = \t {Causal Inference Through the Structural Causal Marginal Problem},\n author = {Gresele, Luigi and K{\\\"u}gelgen, Julius Von and K{\\\"u}bler, Jonas and Kirschbaum, Elke and Sch{\\\"o}lkopf, Bernhard and Janzing, Dominik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7793--7824},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gresele22a/gresele22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gresele22a.html},\n abstract = \t {We introduce an approach to counterfactual inference based on merging information from multiple datasets. We consider a causal reformulation of the statistical marginal problem: given a collection of marginal structural causal models (SCMs) over distinct but overlapping sets of variables, determine the set of joint SCMs that are counterfactually consistent with the marginal ones. We formalise this approach for categorical SCMs using the response function formulation and show that it reduces the space of allowed marginal and joint SCMs. 
Our work thus highlights a new mode of falsifiability through additional variables, in contrast to the statistical one via additional data.}\n}", "pdf": "https://proceedings.mlr.press/v162/gresele22a/gresele22a.pdf", "supp": "", "pdf_size": 604760, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2256399104999533783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Max Planck Institute for Intelligent Systems, Tubingen, Germany+University of Cambridge, Cambridge, United Kingdom; Max Planck Institute for Intelligent Systems, Tubingen, Germany+University of Cambridge, Cambridge, United Kingdom; Max Planck Institute for Intelligent Systems, Tubingen, Germany; Amazon Research, Tubingen, Germany; Max Planck Institute for Intelligent Systems, Tubingen, Germany; Amazon Research, Tubingen, Germany", "aff_domain": "tue.mpg.de; ; ; ; ; ", "email": "tue.mpg.de; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/gresele22a.html", "aff_unique_index": "0+1;0+1;0;2;0;2", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Cambridge;Amazon", "aff_unique_dep": ";;Amazon Research", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.cam.ac.uk;https://www.amazon.science", "aff_unique_abbr": "MPI-IS;Cambridge;Amazon Research", "aff_campus_unique_index": "0+1;0+1;0;0;0;0", "aff_campus_unique": "Tubingen;Cambridge", "aff_country_unique_index": "0+1;0+1;0;0;0;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Causal Transformer for Estimating Counterfactual Outcomes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17693", "id": "17693", "proceeding": "https://proceedings.mlr.press/v162/melnychuk22a.html", "poster": "/media/PosterPDFs/ICML%202022/ada5e0b63ef60e2239fa8abdd4aa2f8e.png?t=1657902801.7339456", "slides": "/media/icml-2022/Slides/17693.pdf", "author_site": "Valentyn Melnychuk, Dennis Frauen, Stefan Feuerriegel", "author": "Valentyn Melnychuk; Dennis Frauen; Stefan Feuerriegel", "abstract": "Estimating counterfactual outcomes over time from observational data is relevant for many applications (e.g., personalized medicine). Yet, state-of-the-art methods build upon simple long short-term memory (LSTM) networks, thus rendering inferences for complex, long-range dependencies challenging. In this paper, we develop a novel Causal Transformer for estimating counterfactual outcomes over time. Our model is specifically designed to capture complex, long-range dependencies among time-varying confounders. For this, we combine three transformer subnetworks with separate inputs for time-varying covariates, previous treatments, and previous outcomes into a joint network with in-between cross-attentions. We further develop a custom, end-to-end training procedure for our Causal Transformer. Specifically, we propose a novel counterfactual domain confusion loss to address confounding bias: it aims to learn adversarial balanced representations, so that they are predictive of the next outcome but non-predictive of the current treatment assignment. We evaluate our Causal Transformer based on synthetic and real-world datasets, where it achieves superior performance over current baselines. 
To the best of our knowledge, this is the first work proposing transformer-based architecture for estimating counterfactual outcomes from longitudinal data.", "bibtex": "@InProceedings{pmlr-v162-melnychuk22a,\n title = \t {Causal Transformer for Estimating Counterfactual Outcomes},\n author = {Melnychuk, Valentyn and Frauen, Dennis and Feuerriegel, Stefan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15293--15329},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/melnychuk22a/melnychuk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/melnychuk22a.html},\n abstract = \t {Estimating counterfactual outcomes over time from observational data is relevant for many applications (e.g., personalized medicine). Yet, state-of-the-art methods build upon simple long short-term memory (LSTM) networks, thus rendering inferences for complex, long-range dependencies challenging. In this paper, we develop a novel Causal Transformer for estimating counterfactual outcomes over time. Our model is specifically designed to capture complex, long-range dependencies among time-varying confounders. For this, we combine three transformer subnetworks with separate inputs for time-varying covariates, previous treatments, and previous outcomes into a joint network with in-between cross-attentions. We further develop a custom, end-to-end training procedure for our Causal Transformer. Specifically, we propose a novel counterfactual domain confusion loss to address confounding bias: it aims to learn adversarial balanced representations, so that they are predictive of the next outcome but non-predictive of the current treatment assignment. We evaluate our Causal Transformer based on synthetic and real-world datasets, where it achieves superior performance over current baselines. 
To the best of our knowledge, this is the first work proposing transformer-based architecture for estimating counterfactual outcomes from longitudinal data.}\n}", "pdf": "https://proceedings.mlr.press/v162/melnychuk22a/melnychuk22a.pdf", "supp": "", "pdf_size": 1208872, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15562561940840223837&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "LMU Munich; LMU Munich; LMU Munich", "aff_domain": "lmu.de; ; ", "email": "lmu.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/melnychuk22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Ludwig Maximilian University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.lmu.de", "aff_unique_abbr": "LMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Causal structure-based root cause analysis of outliers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17527", "id": "17527", "proceeding": "https://proceedings.mlr.press/v162/budhathoki22a.html", "poster": "/media/PosterPDFs/ICML%202022/dbbf603ff0e99629dda5d75b6f75f966.png?t=1657569584.1173372", "slides": "", "author_site": "Kailash Budhathoki, Lenon Minorics, Patrick Bloebaum, Dominik Janzing", "author": "Kailash Budhathoki; Lenon Minorics; Patrick Bloebaum; Dominik Janzing", "abstract": "Current techniques for explaining outliers cannot tell what caused the outliers. We present a formal method to identify \"root causes\" of outliers, amongst variables. The method requires a causal graph of the variables along with the functional causal model. It quantifies the contribution of each variable to the target outlier score, which explains to what extent each variable is a \"root cause\" of the target outlier. We study the empirical performance of the method through simulations and present a real-world case study identifying \"root causes\" of extreme river flows.", "bibtex": "@InProceedings{pmlr-v162-budhathoki22a,\n title = \t {Causal structure-based root cause analysis of outliers},\n author = {Budhathoki, Kailash and Minorics, Lenon and Bloebaum, Patrick and Janzing, Dominik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2357--2369},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/budhathoki22a/budhathoki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/budhathoki22a.html},\n abstract = \t {Current techniques for explaining outliers cannot tell what caused the outliers. We present a formal method to identify \"root causes\" of outliers, amongst variables. The method requires a causal graph of the variables along with the functional causal model. It quantifies the contribution of each variable to the target outlier score, which explains to what extent each variable is a \"root cause\" of the target outlier. 
We study the empirical performance of the method through simulations and present a real-world case study identifying \"root causes\" of extreme river flows.}\n}", "pdf": "https://proceedings.mlr.press/v162/budhathoki22a/budhathoki22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/budhathoki22a-supp.zip", "pdf_size": 724959, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7222731826584855884&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Amazon Research T\u00fcbingen; Amazon Research T\u00fcbingen; Amazon Research T\u00fcbingen; Amazon Research T\u00fcbingen", "aff_domain": "amazon.com; ; ; ", "email": "amazon.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/budhathoki22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon Research", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "T\u00fcbingen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Centroid Approximation for Bootstrap: Improving Particle Quality at Inference", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16013", "id": "16013", "proceeding": "https://proceedings.mlr.press/v162/ye22a.html", "poster": "", "slides": "", "author_site": "Mao Ye, Qiang Liu", "author": "Mao Ye; Qiang Liu", "abstract": "Bootstrap is a principled and powerful frequentist statistical tool for uncertainty quantification. Unfortunately, standard bootstrap methods are computationally intensive due to the need of drawing a large i.i.d. bootstrap sample to approximate the ideal bootstrap distribution; this largely hinders their application in large-scale machine learning, especially deep learning problems. In this work, we propose an efficient method to explicitly", "bibtex": "@InProceedings{pmlr-v162-ye22a,\n title = \t {Centroid Approximation for Bootstrap: Improving Particle Quality at Inference},\n author = {Ye, Mao and Liu, Qiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25469--25489},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ye22a/ye22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ye22a.html},\n abstract = \t {Bootstrap is a principled and powerful frequentist statistical tool for uncertainty quantification. Unfortunately, standard bootstrap methods are computationally intensive due to the need of drawing a large i.i.d. bootstrap sample to approximate the ideal bootstrap distribution; this largely hinders their application in large-scale machine learning, especially deep learning problems. 
In this work, we propose an efficient method to explicitly", "pdf": "https://proceedings.mlr.press/v162/ye22a/ye22a.pdf", "supp": "", "pdf_size": 990964, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CRTrLK0OUsoJ:scholar.google.com/&scioq=Centroid+Approximation+for+Bootstrap:+Improving+Particle+Quality+at+Inference&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff": "Department of Computer Science, University of Texas at Austin; Department of Computer Science, University of Texas at Austin", "aff_domain": "utexas.edu; ", "email": "utexas.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ye22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "CerDEQ: Certifiable Deep Equilibrium Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17433", "id": "17433", "proceeding": "https://proceedings.mlr.press/v162/li22t.html", "poster": "/media/PosterPDFs/ICML%202022/a9eb812238f753132652ae09963a05e9.png?t=1658048436.8433402", "slides": "", "author_site": "Mingjie Li, Yisen Wang, Zhouchen Lin", "author": "Mingjie Li; Yisen Wang; Zhouchen Lin", "abstract": "Recently, certifiable robust training methods via bound propagation have been proposed for training neural networks with certifiable robustness guarantees. However, no neural architectures with regular convolution and linear layers perform better in the certifiable training than the plain CNNs, since the output bounds for the deep explicit models increase quickly as their depth increases. And such a phenomenon significantly hinders certifiable training. Meanwhile, the Deep Equilibrium Model\u00a0(DEQ) is more representative and robust due to their equivalent infinite depth and controllable global Lipschitz. But no work has been proposed to explore whether DEQ can show advantages in certified training. In this work, we aim to tackle the problem of DEQ\u2019s certified training. To obtain the output bound based on the bound propagation scheme in the implicit model, we first involve the adjoint DEQ for bound approximation. Furthermore, we also use the weight orthogonalization method and other tricks specified for DEQ to stabilize the certifiable training. With our approach, we can obtain the certifiable DEQ called CerDEQ. 
Our CerDEQ can achieve state-of-the-art performance compared with models using regular convolution and linear layers on $\\ell_\\infty$ tasks with $\\epsilon=8/255$: $64.72\\%$ certified error for CIFAR-$10$ and $94.45\\%$ certified error for Tiny ImageNet.", "bibtex": "@InProceedings{pmlr-v162-li22t,\n title = \t {{C}er{DEQ}: Certifiable Deep Equilibrium Model},\n author = {Li, Mingjie and Wang, Yisen and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12998--13013},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22t/li22t.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22t.html},\n abstract = \t {Recently, certifiable robust training methods via bound propagation have been proposed for training neural networks with certifiable robustness guarantees. However, no neural architectures with regular convolution and linear layers perform better in the certifiable training than the plain CNNs, since the output bounds for the deep explicit models increase quickly as their depth increases. And such a phenomenon significantly hinders certifiable training. Meanwhile, the Deep Equilibrium Model\u00a0(DEQ) is more representative and robust due to their equivalent infinite depth and controllable global Lipschitz. But no work has been proposed to explore whether DEQ can show advantages in certified training. In this work, we aim to tackle the problem of DEQ\u2019s certified training. To obtain the output bound based on the bound propagation scheme in the implicit model, we first involve the adjoint DEQ for bound approximation. Furthermore, we also use the weight orthogonalization method and other tricks specified for DEQ to stabilize the certifiable training. With our approach, we can obtain the certifiable DEQ called CerDEQ. Our CerDEQ can achieve state-of-the-art performance compared with models using regular convolution and linear layers on $\\ell_\\infty$ tasks with $\\epsilon=8/255$: $64.72\\%$ certified error for CIFAR-$10$ and $94.45\\%$ certified error for Tiny ImageNet.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22t/li22t.pdf", "supp": "", "pdf_size": 504380, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff": "Key Lab. of Machine Perception (MoE), School of Artificial Intelligence, Peking University + Institute for Artificial Intelligence, Peking University + Peng Cheng Laboratory; Key Lab. of Machine Perception (MoE), School of Artificial Intelligence, Peking University + Institute for Artificial Intelligence, Peking University + Peng Cheng Laboratory; Key Lab. 
of Machine Perception (MoE), School of Artificial Intelligence, Peking University + Institute for Artificial Intelligence, Peking University + Peng Cheng Laboratory", "aff_domain": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "email": "pku.edu.cn;pku.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22t.html", "aff_unique_index": "0+0+1;0+0+1;0+0+1", "aff_unique_norm": "Peking University;Pengcheng Laboratory", "aff_unique_dep": "School of Artificial Intelligence;Peng Cheng Laboratory", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", "aff_country_unique": "China" }, { "title": "Certified Adversarial Robustness Under the Bounded Support Set", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17477", "id": "17477", "proceeding": "https://proceedings.mlr.press/v162/kou22a.html", "poster": "/media/PosterPDFs/ICML%202022/5a0c828364dbf6dd406139dab7b25398_FGai07Z.png?t=1657259078.8016317", "slides": "", "author_site": "Yiwen Kou, Qinyuan Zheng, Yisen Wang", "author": "Yiwen Kou; Qinyuan Zheng; Yisen Wang", "abstract": "Deep neural networks (DNNs) have revealed severe vulnerability to adversarial perturbations, beside empirical adversarial training for robustness, the design of provably robust classifiers attracts more and more attention. Randomized smoothing methods provide the certified robustness with agnostic architecture, which is further extended to a provable robustness framework using f-divergence. While these methods cannot be applied to smoothing measures with bounded support set such as uniform probability measure due to the use of likelihood ratio in their certification methods. In this paper, we generalize the $f$-divergence-based framework to a Wasserstein-distance-based and total-variation-distance-based framework that is first able to analyze robustness properties of bounded support set smoothing measures both theoretically and experimentally. By applying our methodology to uniform probability measures with support set $l_p (p=1,2,\\infty\\text{ and general})$ ball, we prove negative certified robustness properties with respect to $l_q (q=1, 2, \\infty)$ perturbations and present experimental results on CIFAR-10 dataset with ResNet to validate our theory. And it is also worth mentioning that our certification procedure only costs constant computation time.", "bibtex": "@InProceedings{pmlr-v162-kou22a,\n title = \t {Certified Adversarial Robustness Under the Bounded Support Set},\n author = {Kou, Yiwen and Zheng, Qinyuan and Wang, Yisen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11559--11597},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kou22a/kou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kou22a.html},\n abstract = \t {Deep neural networks (DNNs) have revealed severe vulnerability to adversarial perturbations, beside empirical adversarial training for robustness, the design of provably robust classifiers attracts more and more attention. 
Randomized smoothing methods provide the certified robustness with agnostic architecture, which is further extended to a provable robustness framework using f-divergence. While these methods cannot be applied to smoothing measures with bounded support set such as uniform probability measure due to the use of likelihood ratio in their certification methods. In this paper, we generalize the $f$-divergence-based framework to a Wasserstein-distance-based and total-variation-distance-based framework that is first able to analyze robustness properties of bounded support set smoothing measures both theoretically and experimentally. By applying our methodology to uniform probability measures with support set $l_p (p=1,2,\\infty\\text{ and general})$ ball, we prove negative certified robustness properties with respect to $l_q (q=1, 2, \\infty)$ perturbations and present experimental results on CIFAR-10 dataset with ResNet to validate our theory. And it is also worth mentioning that our certification procedure only costs constant computation time.}\n}", "pdf": "https://proceedings.mlr.press/v162/kou22a/kou22a.pdf", "supp": "", "pdf_size": 1911422, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14521301205175113544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Yuanpei College, Peking University; Key Lab. of Machine Perception (MoE), School of Artificial Intelligence, Peking University; Institute for Artificial Intelligence, Peking University", "aff_domain": "pku.edu.cn; ; ", "email": "pku.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kou22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "Yuanpei College", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Certified Neural Network Watermarks with Randomized Smoothing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16357", "id": "16357", "proceeding": "https://proceedings.mlr.press/v162/bansal22a.html", "poster": "/media/PosterPDFs/ICML%202022/1731592aca5fb4d789c4119c65c10b4b_9u94mXt.png?t=1657812374.5769374", "slides": "", "author_site": "Arpit Bansal, Ping-yeh Chiang, Michael Curry, Rajiv Jain, Curtis Wigington, Varun Manjunatha, John P Dickerson, Tom Goldstein", "author": "Arpit Bansal; Ping-Yeh Chiang; Michael J Curry; Rajiv Jain; Curtis Wigington; Varun Manjunatha; John P Dickerson; Tom Goldstein", "abstract": "Watermarking is a commonly used strategy to protect creators\u2019 rights to digital images, videos and audio. Recently, watermarking methods have been extended to deep learning models \u2013 in principle, the watermark should be preserved when an adversary tries to copy the model. However, in practice, watermarks can often be removed by an intelligent adversary. Several papers have proposed watermarking methods that claim to be empirically resistant to different types of removal attacks, but these new techniques often fail in the face of new or better-tuned adversaries. 
In this paper, we propose the first", "bibtex": "@InProceedings{pmlr-v162-bansal22a,\n title = \t {Certified Neural Network Watermarks with Randomized Smoothing},\n author = {Bansal, Arpit and Chiang, Ping-Yeh and Curry, Michael J and Jain, Rajiv and Wigington, Curtis and Manjunatha, Varun and Dickerson, John P and Goldstein, Tom},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1450--1465},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bansal22a/bansal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bansal22a.html},\n abstract = \t {Watermarking is a commonly used strategy to protect creators\u2019 rights to digital images, videos and audio. Recently, watermarking methods have been extended to deep learning models \u2013 in principle, the watermark should be preserved when an adversary tries to copy the model. However, in practice, watermarks can often be removed by an intelligent adversary. Several papers have proposed watermarking methods that claim to be empirically resistant to different types of removal attacks, but these new techniques often fail in the face of new or better-tuned adversaries. In this paper, we propose the first", "pdf": "https://proceedings.mlr.press/v162/bansal22a/bansal22a.pdf", "supp": "", "pdf_size": 722145, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2567091061635643130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of Maryland, College Park; University of Maryland, College Park; University of Maryland, College Park; Adobe Research, USA; Adobe Research, USA; Adobe Research, USA; University of Maryland, College Park; University of Maryland, College Park", "aff_domain": "umd.edu;cs.umd.edu; ; ; ; ; ; ", "email": "umd.edu;cs.umd.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/bansal22a.html", "aff_unique_index": "0;0;0;1;1;1;0;0", "aff_unique_norm": "University of Maryland;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://www/umd.edu;https://research.adobe.com", "aff_unique_abbr": "UMD;Adobe", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Certified Robustness Against Natural Language Attacks by Causal Intervention", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17857", "id": "17857", "proceeding": "https://proceedings.mlr.press/v162/zhao22g.html", "poster": "", "slides": "", "author_site": "Haiteng Zhao, Chang Ma, Xinshuai Dong, Anh Tuan Luu, Zhi-Hong Deng, Hanwang Zhang", "author": "Haiteng Zhao; Chang Ma; Xinshuai Dong; Anh Tuan Luu; Zhi-Hong Deng; Hanwang Zhang", "abstract": "Deep learning models have achieved great success in many fields, yet they are vulnerable to adversarial examples. This paper follows a causal perspective to look into the adversarial vulnerability and proposes Causal Intervention by Semantic Smoothing (CISS), a novel framework towards robustness against natural language attacks. 
Instead of merely fitting observational data, CISS learns causal effects p(y|do(x)) by smoothing in the latent semantic space to make robust predictions, which scales to deep architectures and avoids tedious construction of noise customized for specific attacks. CISS is provably robust against word substitution attacks, as well as empirically robust even when perturbations are strengthened by unknown attack algorithms. For example, on YELP, CISS surpasses the runner-up by 6.8% in terms of certified robustness against word substitutions, and achieves 80.7% empirical robustness when syntactic attacks are integrated.", "bibtex": "@InProceedings{pmlr-v162-zhao22g,\n title = \t {Certified Robustness Against Natural Language Attacks by Causal Intervention},\n author = {Zhao, Haiteng and Ma, Chang and Dong, Xinshuai and Luu, Anh Tuan and Deng, Zhi-Hong and Zhang, Hanwang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26958--26970},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22g/zhao22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22g.html},\n abstract = \t {Deep learning models have achieved great success in many fields, yet they are vulnerable to adversarial examples. This paper follows a causal perspective to look into the adversarial vulnerability and proposes Causal Intervention by Semantic Smoothing (CISS), a novel framework towards robustness against natural language attacks. Instead of merely fitting observational data, CISS learns causal effects p(y|do(x)) by smoothing in the latent semantic space to make robust predictions, which scales to deep architectures and avoids tedious construction of noise customized for specific attacks. CISS is provably robust against word substitution attacks, as well as empirically robust even when perturbations are strengthened by unknown attack algorithms. 
For example, on YELP, CISS surpasses the runner-up by 6.8% in terms of certified robustness against word substitutions, and achieves 80.7% empirical robustness when syntactic attacks are integrated.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22g/zhao22g.pdf", "supp": "", "pdf_size": 1664329, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16167491038280669708&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Peking University; Peking University; Carnegie Mellon University; Nanyang Technological University+Peking University; Peking University; Nanyang Technological University", "aff_domain": "pku.edu.cn;pku.edu.cn;cs.cmu.edu;ntu.edu.sg;pku.edu.cn;ntu.edu.sg", "email": "pku.edu.cn;pku.edu.cn;cs.cmu.edu;ntu.edu.sg;pku.edu.cn;ntu.edu.sg", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhao22g.html", "aff_unique_index": "0;0;1;2+0;0;2", "aff_unique_norm": "Peking University;Carnegie Mellon University;Nanyang Technological University", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.cmu.edu;https://www.ntu.edu.sg", "aff_unique_abbr": "Peking U;CMU;NTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;2+0;0;2", "aff_country_unique": "China;United States;Singapore" }, { "title": "Certifying Out-of-Domain Generalization for Blackbox Functions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17665", "id": "17665", "proceeding": "https://proceedings.mlr.press/v162/weber22a.html", "poster": "/media/PosterPDFs/ICML%202022/20ba7f85c05c5e5b75abced9ece67ac9_fX8gwzL.png?t=1657874026.6169696", "slides": "", "author_site": "Maurice Weber, Linyi Li, Boxin Wang, Zhikuan Zhao, Bo Li, Ce Zhang", "author": "Maurice G Weber; Linyi Li; Boxin Wang; Zhikuan Zhao; Bo Li; Ce Zhang", "abstract": "Certifying the robustness of model performance under bounded data distribution drifts has recently attracted intensive interest under the umbrella of distributional robustness. However, existing techniques either make strong assumptions on the model class and loss functions that can be certified, such as smoothness expressed via Lipschitz continuity of gradients, or require to solve complex optimization problems. As a result, the wider application of these techniques is currently limited by its scalability and flexibility \u2014 these techniques often do not scale to large-scale datasets with modern deep neural networks or cannot handle loss functions which may be non-smooth such as the 0-1 loss. In this paper, we focus on the problem of certifying distributional robustness for blackbox models and bounded loss functions, and propose a novel certification framework based on the Hellinger distance. Our certification technique scales to ImageNet-scale datasets, complex models, and a diverse set of loss functions. We then focus on one specific application enabled by such scalability and flexibility, i.e., certifying out-of-domain generalization for large neural networks and loss functions such as accuracy and AUC. 
We experimentally validate our certification method on a number of datasets, ranging from ImageNet, where we provide the first non-vacuous certified out-of-domain generalization, to smaller classification tasks where we are able to compare with the state-of-the-art and show that our method performs considerably better.", "bibtex": "@InProceedings{pmlr-v162-weber22a,\n title = \t {Certifying Out-of-Domain Generalization for Blackbox Functions},\n author = {Weber, Maurice G and Li, Linyi and Wang, Boxin and Zhao, Zhikuan and Li, Bo and Zhang, Ce},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23527--23548},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/weber22a/weber22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/weber22a.html},\n abstract = \t {Certifying the robustness of model performance under bounded data distribution drifts has recently attracted intensive interest under the umbrella of distributional robustness. However, existing techniques either make strong assumptions on the model class and loss functions that can be certified, such as smoothness expressed via Lipschitz continuity of gradients, or require to solve complex optimization problems. As a result, the wider application of these techniques is currently limited by its scalability and flexibility \u2014 these techniques often do not scale to large-scale datasets with modern deep neural networks or cannot handle loss functions which may be non-smooth such as the 0-1 loss. In this paper, we focus on the problem of certifying distributional robustness for blackbox models and bounded loss functions, and propose a novel certification framework based on the Hellinger distance. Our certification technique scales to ImageNet-scale datasets, complex models, and a diverse set of loss functions. We then focus on one specific application enabled by such scalability and flexibility, i.e., certifying out-of-domain generalization for large neural networks and loss functions such as accuracy and AUC. 
We experimentally validate our certification method on a number of datasets, ranging from ImageNet, where we provide the first non-vacuous certified out-of-domain generalization, to smaller classification tasks where we are able to compare with the state-of-the-art and show that our method performs considerably better.}\n}", "pdf": "https://proceedings.mlr.press/v162/weber22a/weber22a.pdf", "supp": "", "pdf_size": 1870524, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5540253257951212310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, ETH Zurich; UIUC, USA; UIUC, USA; Department of Computer Science, ETH Zurich; UIUC, USA; Department of Computer Science, ETH Zurich", "aff_domain": "inf.ethz.ch; ; ;inf.ethz.ch; ;inf.ethz.ch", "email": "inf.ethz.ch; ; ;inf.ethz.ch; ;inf.ethz.ch", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/weber22a.html", "aff_unique_index": "0;1;1;0;1;0", "aff_unique_norm": "ETH Zurich;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://illinois.edu", "aff_unique_abbr": "ETHZ;UIUC", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "Channel Importance Matters in Few-Shot Image Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18347", "id": "18347", "proceeding": "https://proceedings.mlr.press/v162/luo22c.html", "poster": "", "slides": "", "author_site": "Xu Luo, Jing Xu, ZENGLIN Xu", "author": "Xu Luo; Jing Xu; Zenglin Xu", "abstract": "Few-Shot Learning (FSL) requires vision models to quickly adapt to brand-new classification tasks with a shift in task distribution. Understanding the difficulties posed by this task distribution shift is central to FSL. In this paper, we show that a simple channel-wise feature transformation may be the key to unraveling this secret from a channel perspective. When facing novel few-shot tasks in the test-time datasets, this transformation can greatly improve the generalization ability of learned image representations, while being agnostic to the choice of datasets and training algorithms. Through an in-depth analysis of this transformation, we find that the difficulty of representation transfer in FSL stems from the severe channel bias problem of image representations: channels may have different importance in different tasks, while convolutional neural networks are likely to be insensitive, or respond incorrectly to such a shift. 
This points out a core problem of the generalization ability of modern vision systems which needs further attention in the future.", "bibtex": "@InProceedings{pmlr-v162-luo22c,\n title = \t {Channel Importance Matters in Few-Shot Image Classification},\n author = {Luo, Xu and Xu, Jing and Xu, Zenglin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14542--14559},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/luo22c/luo22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/luo22c.html},\n abstract = \t {Few-Shot Learning (FSL) requires vision models to quickly adapt to brand-new classification tasks with a shift in task distribution. Understanding the difficulties posed by this task distribution shift is central to FSL. In this paper, we show that a simple channel-wise feature transformation may be the key to unraveling this secret from a channel perspective. When facing novel few-shot tasks in the test-time datasets, this transformation can greatly improve the generalization ability of learned image representations, while being agnostic to the choice of datasets and training algorithms. Through an in-depth analysis of this transformation, we find that the difficulty of representation transfer in FSL stems from the severe channel bias problem of image representations: channels may have different importance in different tasks, while convolutional neural networks are likely to be insensitive, or respond incorrectly to such a shift. 
This points out a core problem of the generalization ability of modern vision systems which needs further attention in the future.}\n}", "pdf": "https://proceedings.mlr.press/v162/luo22c/luo22c.pdf", "supp": "", "pdf_size": 1316010, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11800681644277658610&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Electronic Science and Technology of China; Harbin Institute of Technology Shenzhen; Harbin Institute of Technology Shenzhen+Pengcheng Laboratory", "aff_domain": "outlook.com; ;hit.edu.cn", "email": "outlook.com; ;hit.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/luo22c.html", "aff_unique_index": "0;1;1+2", "aff_unique_norm": "University of Electronic Science and Technology of China;Harbin Institute of Technology;Pengcheng Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uestc.edu.cn;https://www.hit.edu.cn/;", "aff_unique_abbr": "UESTC;HIT;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "China" }, { "title": "Characterizing and Overcoming the Greedy Nature of Learning in Multi-modal Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16703", "id": "16703", "proceeding": "https://proceedings.mlr.press/v162/wu22d.html", "poster": "/media/PosterPDFs/ICML%202022/1feb4cdda5aafe2a48cbe27544cd8e4b_M6qqeNk.png?t=1657768691.3012106", "slides": "/media/icml-2022/Slides/16703.pdf", "author_site": "Nan Wu, Stanislaw Jastrzebski, Kyunghyun Cho, Krzysztof J Geras", "author": "Nan Wu; Stanislaw Jastrzebski; Kyunghyun Cho; Krzysztof J Geras", "abstract": "We hypothesize that due to the greedy nature of learning in multi-modal deep neural networks, these models tend to rely on just one modality while under-fitting the other modalities. Such behavior is counter-intuitive and hurts the models\u2019 generalization, as we observe empirically. To estimate the model\u2019s dependence on each modality, we compute the gain on the accuracy when the model has access to it in addition to another modality. We refer to this gain as the conditional utilization rate. In the experiments, we consistently observe an imbalance in conditional utilization rates between modalities, across multiple tasks and architectures. Since conditional utilization rate cannot be computed efficiently during training, we introduce a proxy for it based on the pace at which the model learns from each modality, which we refer to as the conditional learning speed. We propose an algorithm to balance the conditional learning speeds between modalities during training and demonstrate that it indeed addresses the issue of greedy learning. 
The proposed algorithm improves the model\u2019s generalization on three datasets: Colored MNIST, ModelNet40, and NVIDIA Dynamic Hand Gesture.", "bibtex": "@InProceedings{pmlr-v162-wu22d,\n title = \t {Characterizing and Overcoming the Greedy Nature of Learning in Multi-modal Deep Neural Networks},\n author = {Wu, Nan and Jastrzebski, Stanislaw and Cho, Kyunghyun and Geras, Krzysztof J},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24043--24055},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22d/wu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22d.html},\n abstract = \t {We hypothesize that due to the greedy nature of learning in multi-modal deep neural networks, these models tend to rely on just one modality while under-fitting the other modalities. Such behavior is counter-intuitive and hurts the models\u2019 generalization, as we observe empirically. To estimate the model\u2019s dependence on each modality, we compute the gain on the accuracy when the model has access to it in addition to another modality. We refer to this gain as the conditional utilization rate. In the experiments, we consistently observe an imbalance in conditional utilization rates between modalities, across multiple tasks and architectures. Since conditional utilization rate cannot be computed efficiently during training, we introduce a proxy for it based on the pace at which the model learns from each modality, which we refer to as the conditional learning speed. We propose an algorithm to balance the conditional learning speeds between modalities during training and demonstrate that it indeed addresses the issue of greedy learning. 
The proposed algorithm improves the model\u2019s generalization on three datasets: Colored MNIST, ModelNet40, and NVIDIA Dynamic Hand Gesture.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22d/wu22d.pdf", "supp": "", "pdf_size": 1743882, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12235200636315362810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Center for Data Science, New York University + Courant Institute of Mathematical Sciences, New York University + Genentech + CIFAR LMB; NYU Grossman School of Medicine; Center for Data Science, New York University + Courant Institute of Mathematical Sciences, New York University + Genentech + CIFAR LMB; NYU Grossman School of Medicine + Center for Data Science, New York University + Courant Institute of Mathematical Sciences, New York University", "aff_domain": "nyu.edu; ; ; ", "email": "nyu.edu; ; ; ", "github": "https://github.com/nyukat/greedy_multimodal_learning", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wu22d.html", "aff_unique_index": "0+0+1+2;3;0+0+1+2;3+0+0", "aff_unique_norm": "New York University;Genentech;CIFAR;New York University Grossman School of Medicine", "aff_unique_dep": "Center for Data Science;;LMB;School of Medicine", "aff_unique_url": "https://www.nyu.edu;https://www.genentech.com;https://www.cifar.ca;https://med.nyu.edu", "aff_unique_abbr": "NYU;Genentech;CIFAR;NYU Grossman SOM", "aff_campus_unique_index": "0+0;0;0+0;0+0+0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0+0+0+1;0;0+0+0+1;0+0+0", "aff_country_unique": "United States;Canada" }, { "title": "Choosing Answers in Epsilon-Best-Answer Identification for Linear Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16157", "id": "16157", "proceeding": "https://proceedings.mlr.press/v162/jourdan22a.html", "poster": "/media/PosterPDFs/ICML%202022/9cf81d8026a9018052c429cc4e56739b_2COdp7l.png?t=1655466696.6692815", "slides": "/media/icml-2022/Slides/16157.pdf", "author_site": "Marc Jourdan, R\u00e9my Degenne", "author": "Marc Jourdan; R\u00e9my Degenne", "abstract": "In pure-exploration problems, information is gathered sequentially to answer a question on the stochastic environment. While best-arm identification for linear bandits has been extensively studied in recent years, few works have been dedicated to identifying one arm that is $\\varepsilon$-close to the best one (and not exactly the best one). In this problem with several correct answers, an identification algorithm should focus on one candidate among those answers and verify that it is correct. We demonstrate that picking the answer with highest mean does not allow an algorithm to reach asymptotic optimality in terms of expected sample complexity. 
Instead, a", "bibtex": "@InProceedings{pmlr-v162-jourdan22a,\n title = \t {Choosing Answers in Epsilon-Best-Answer Identification for Linear Bandits},\n author = {Jourdan, Marc and Degenne, R{\\'e}my},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10384--10430},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jourdan22a/jourdan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jourdan22a.html},\n abstract = \t {In pure-exploration problems, information is gathered sequentially to answer a question on the stochastic environment. While best-arm identification for linear bandits has been extensively studied in recent years, few works have been dedicated to identifying one arm that is $\\varepsilon$-close to the best one (and not exactly the best one). In this problem with several correct answers, an identification algorithm should focus on one candidate among those answers and verify that it is correct. We demonstrate that picking the answer with highest mean does not allow an algorithm to reach asymptotic optimality in terms of expected sample complexity. Instead, a", "pdf": "https://proceedings.mlr.press/v162/jourdan22a/jourdan22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jourdan22a-supp.zip", "pdf_size": 1207508, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2210760292654805053&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Univ. Lille, CNRS, Inria, Centrale Lille, UMR 9198-CRIStAL, F-59000 Lille, France; Univ. Lille, CNRS, Inria, Centrale Lille, UMR 9198-CRIStAL, F-59000 Lille, France", "aff_domain": "inria.fr; ", "email": "inria.fr; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/jourdan22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Lille", "aff_unique_dep": "UMR 9198-CRIStAL", "aff_unique_url": "https://www.univ-lille.fr", "aff_unique_abbr": "Univ. Lille", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lille", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Class-Imbalanced Semi-Supervised Learning with Adaptive Thresholding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16069", "id": "16069", "proceeding": "https://proceedings.mlr.press/v162/guo22e.html", "poster": "", "slides": "/media/icml-2022/Slides/16069.pdf", "author_site": "Lan-Zhe Guo, Yu-Feng Li", "author": "Lan-Zhe Guo; Yu-Feng Li", "abstract": "Semi-supervised learning (SSL) has proven to be successful in overcoming labeling difficulties by leveraging unlabeled data. Previous SSL algorithms typically assume a balanced class distribution. However, real-world datasets are usually class-imbalanced, causing the performance of existing SSL algorithms to be seriously decreased. One essential reason is that pseudo-labels for unlabeled data are selected based on a fixed confidence threshold, resulting in low performance on minority classes. In this paper, we develop a simple yet effective framework, which only involves adaptive thresholding for different classes in SSL algorithms, and achieves remarkable performance improvement on more than twenty imbalance ratios. 
Specifically, we explicitly optimize the number of pseudo-labels for each class in the SSL objective, so as to simultaneously obtain adaptive thresholds and minimize empirical risk. Moreover, the determination of the adaptive threshold can be efficiently obtained by a closed-form solution. Extensive experimental results demonstrate the effectiveness of our proposed algorithms.", "bibtex": "@InProceedings{pmlr-v162-guo22e,\n title = \t {Class-Imbalanced Semi-Supervised Learning with Adaptive Thresholding},\n author = {Guo, Lan-Zhe and Li, Yu-Feng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8082--8094},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22e/guo22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22e.html},\n abstract = \t {Semi-supervised learning (SSL) has proven to be successful in overcoming labeling difficulties by leveraging unlabeled data. Previous SSL algorithms typically assume a balanced class distribution. However, real-world datasets are usually class-imbalanced, causing the performance of existing SSL algorithms to be seriously decreased. One essential reason is that pseudo-labels for unlabeled data are selected based on a fixed confidence threshold, resulting in low performance on minority classes. In this paper, we develop a simple yet effective framework, which only involves adaptive thresholding for different classes in SSL algorithms, and achieves remarkable performance improvement on more than twenty imbalance ratios. Specifically, we explicitly optimize the number of pseudo-labels for each class in the SSL objective, so as to simultaneously obtain adaptive thresholds and minimize empirical risk. Moreover, the determination of the adaptive threshold can be efficiently obtained by a closed-form solution. 
Extensive experimental results demonstrate the effectiveness of our proposed algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22e/guo22e.pdf", "supp": "", "pdf_size": 3126229, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9133062091208965701&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210023, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210023, China", "aff_domain": "lamda.nju.edu.cn;lamda.nju.edu.cn", "email": "lamda.nju.edu.cn;lamda.nju.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/guo22e.html", "aff_unique_index": "0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "National Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Cliff Diving: Exploring Reward Surfaces in Reinforcement Learning Environments", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16959", "id": "16959", "proceeding": "https://proceedings.mlr.press/v162/sullivan22a.html", "poster": "/media/PosterPDFs/ICML%202022/a57ecd54d4df7d999bd9c5e3b973ec75_t2WgBDW.png?t=1658293730.0227757", "slides": "", "author_site": "Ryan Sullivan, Jordan Terry, Benjamin Black, John P Dickerson", "author": "Ryan Sullivan; Jordan K Terry; Benjamin Black; John P Dickerson", "abstract": "Visualizing optimization landscapes has resulted in many fundamental insights in numeric optimization, specifically regarding novel improvements to optimization techniques. However, visualizations of the objective that reinforcement learning optimizes (the \"reward surface\") have only ever been generated for a small number of narrow contexts. This work presents reward surfaces and related visualizations of 27 of the most widely used reinforcement learning environments in Gym for the first time. We also explore reward surfaces in the policy gradient direction and show for the first time that many popular reinforcement learning environments have frequent \"cliffs\" (sudden large drops in expected reward). We demonstrate that A2C often \"dives off\" these cliffs into low reward regions of the parameter space while PPO avoids them, confirming a popular intuition for PPO\u2019s improved performance over previous methods. We additionally introduce a highly extensible library that allows researchers to easily generate these visualizations in the future. 
Our findings provide new intuition to explain the successes and failures of modern RL methods, and our visualizations concretely characterize several failure modes of reinforcement learning agents in novel ways.", "bibtex": "@InProceedings{pmlr-v162-sullivan22a,\n title = \t {Cliff Diving: Exploring Reward Surfaces in Reinforcement Learning Environments},\n author = {Sullivan, Ryan and Terry, Jordan K and Black, Benjamin and Dickerson, John P},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20744--20776},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sullivan22a/sullivan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sullivan22a.html},\n abstract = \t {Visualizing optimization landscapes has resulted in many fundamental insights in numeric optimization, specifically regarding novel improvements to optimization techniques. However, visualizations of the objective that reinforcement learning optimizes (the \"reward surface\") have only ever been generated for a small number of narrow contexts. This work presents reward surfaces and related visualizations of 27 of the most widely used reinforcement learning environments in Gym for the first time. We also explore reward surfaces in the policy gradient direction and show for the first time that many popular reinforcement learning environments have frequent \"cliffs\" (sudden large drops in expected reward). We demonstrate that A2C often \"dives off\" these cliffs into low reward regions of the parameter space while PPO avoids them, confirming a popular intuition for PPO\u2019s improved performance over previous methods. We additionally introduce a highly extensible library that allows researchers to easily generate these visualizations in the future. 
Our findings provide new intuition to explain the successes and failures of modern RL methods, and our visualizations concretely characterize several failure modes of reinforcement learning agents in novel ways.}\n}", "pdf": "https://proceedings.mlr.press/v162/sullivan22a/sullivan22a.pdf", "supp": "", "pdf_size": 37181135, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1786735970050040127&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Swarm Labs+Department of Computer Science, University of Maryland, College Park; Swarm Labs+Department of Computer Science, University of Maryland, College Park; Swarm Labs+Department of Computer Science, University of Maryland, College Park; Department of Computer Science, University of Maryland, College Park", "aff_domain": "rsulliumd.edu; ; ; ", "email": "rsulliumd.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/sullivan22a.html", "aff_unique_index": "0+1;0+1;0+1;1", "aff_unique_norm": "Swarm Labs;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": ";https://www/umd.edu", "aff_unique_abbr": ";UMD", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "1;1;1;1", "aff_country_unique": ";United States" }, { "title": "Closed-Form Diffeomorphic Transformations for Time Series Alignment", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17697", "id": "17697", "proceeding": "https://proceedings.mlr.press/v162/martinez22a.html", "poster": "/media/PosterPDFs/ICML%202022/371bce7dc83817b7893bcdeed13799b5_Vdn44Un.png?t=1657091788.10969", "slides": "/media/icml-2022/Slides/17697.pdf", "author_site": "I\u00f1igo Martinez, Elisabeth Viles, Igor G. Olaizola", "author": "I\u00f1igo Martinez; Elisabeth Viles; Igor G. Olaizola", "abstract": "Time series alignment methods call for highly expressive, differentiable and invertible warping functions which preserve temporal topology, i.e diffeomorphisms. Diffeomorphic warping functions can be generated from the integration of velocity fields governed by an ordinary differential equation (ODE). Gradient-based optimization frameworks containing diffeomorphic transformations require to calculate derivatives to the differential equation\u2019s solution with respect to the model parameters, i.e. sensitivity analysis. Unfortunately, deep learning frameworks typically lack automatic-differentiation-compatible sensitivity analysis methods; and implicit functions, such as the solution of ODE, require particular care. Current solutions appeal to adjoint sensitivity methods, ad-hoc numerical solvers or ResNet\u2019s Eulerian discretization. In this work, we present a closed-form expression for the ODE solution and its gradient under continuous piecewise-affine (CPA) velocity functions. We present a highly optimized implementation of the results on CPU and GPU. Furthermore, we conduct extensive experiments on several datasets to validate the generalization ability of our model to unseen data for time-series joint alignment. 
Results show significant improvements both in terms of efficiency and accuracy.", "bibtex": "@InProceedings{pmlr-v162-martinez22a,\n title = \t {Closed-Form Diffeomorphic Transformations for Time Series Alignment},\n author = {Martinez, I{\\~n}igo and Viles, Elisabeth and Olaizola, Igor G.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15122--15158},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/martinez22a/martinez22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/martinez22a.html},\n abstract = \t {Time series alignment methods call for highly expressive, differentiable and invertible warping functions which preserve temporal topology, i.e diffeomorphisms. Diffeomorphic warping functions can be generated from the integration of velocity fields governed by an ordinary differential equation (ODE). Gradient-based optimization frameworks containing diffeomorphic transformations require to calculate derivatives to the differential equation\u2019s solution with respect to the model parameters, i.e. sensitivity analysis. Unfortunately, deep learning frameworks typically lack automatic-differentiation-compatible sensitivity analysis methods; and implicit functions, such as the solution of ODE, require particular care. Current solutions appeal to adjoint sensitivity methods, ad-hoc numerical solvers or ResNet\u2019s Eulerian discretization. In this work, we present a closed-form expression for the ODE solution and its gradient under continuous piecewise-affine (CPA) velocity functions. We present a highly optimized implementation of the results on CPU and GPU. Furthermore, we conduct extensive experiments on several datasets to validate the generalization ability of our model to unseen data for time-series joint alignment. 
Results show significant improvements both in terms of efficiency and accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/martinez22a/martinez22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/martinez22a-supp.zip", "pdf_size": 2971021, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15344236423757479416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Vicomtech Foundation, Basque Research and Technology Alliance (BRTA), San Sebastian, Spain; TECNUN School of Engineering, University of Navarra, San Sebastian, Spain+Institute of Data Science and Artificial Intelligence, University of Navarra, Pamplona, Spain; Vicomtech Foundation, Basque Research and Technology Alliance (BRTA), San Sebastian, Spain", "aff_domain": "vicomtech.org; ; ", "email": "vicomtech.org; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/martinez22a.html", "aff_unique_index": "0;1+1;0", "aff_unique_norm": "Vicomtech Foundation;University of Navarra", "aff_unique_dep": ";School of Engineering", "aff_unique_url": ";https://www.unav.edu", "aff_unique_abbr": ";UNAV", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "San Sebastian;Pamplona", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "Spain" }, { "title": "Co-training Improves Prompt-based Learning for Large Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17881", "id": "17881", "proceeding": "https://proceedings.mlr.press/v162/lang22a.html", "poster": "/media/PosterPDFs/ICML%202022/894b77f805bd94d292574c38c5d628d5.png?t=1657916218.2764318", "slides": "", "author_site": "Hunter Lang, Monica Agrawal, Yoon Kim, David Sontag", "author": "Hunter Lang; Monica N Agrawal; Yoon Kim; David Sontag", "abstract": "We demonstrate that co-training (Blum & Mitchell, 1998) can improve the performance of prompt-based learning by using unlabeled data. While prompting has emerged as a promising paradigm for few-shot and zero-shot learning, it is often brittle and requires much larger models compared to the standard supervised setup. We find that co-training makes it possible to improve the original prompt model and at the same time learn a smaller, downstream task-specific model. In the case where we only have partial access to a prompt model (e.g., output probabilities from GPT-3 (Brown et al., 2020)) we learn a calibration model over the prompt outputs. When we have full access to the prompt model\u2019s gradients but full finetuning remains prohibitively expensive (e.g., T0 (Sanh et al., 2021)), we learn a set of soft prompt continuous vectors to iteratively update the prompt model. 
We find that models trained in this manner can significantly improve performance on challenging datasets where there is currently a large gap between prompt-based learning and fully-supervised models.", "bibtex": "@InProceedings{pmlr-v162-lang22a,\n title = \t {Co-training Improves Prompt-based Learning for Large Language Models},\n author = {Lang, Hunter and Agrawal, Monica N and Kim, Yoon and Sontag, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11985--12003},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lang22a/lang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lang22a.html},\n abstract = \t {We demonstrate that co-training (Blum & Mitchell, 1998) can improve the performance of prompt-based learning by using unlabeled data. While prompting has emerged as a promising paradigm for few-shot and zero-shot learning, it is often brittle and requires much larger models compared to the standard supervised setup. We find that co-training makes it possible to improve the original prompt model and at the same time learn a smaller, downstream task-specific model. In the case where we only have partial access to a prompt model (e.g., output probabilities from GPT-3 (Brown et al., 2020)) we learn a calibration model over the prompt outputs. When we have full access to the prompt model\u2019s gradients but full finetuning remains prohibitively expensive (e.g., T0 (Sanh et al., 2021)), we learn a set of soft prompt continuous vectors to iteratively update the prompt model. 
We find that models trained in this manner can significantly improve performance on challenging datasets where there is currently a large gap between prompt-based learning and fully-supervised models.}\n}", "pdf": "https://proceedings.mlr.press/v162/lang22a/lang22a.pdf", "supp": "", "pdf_size": 727094, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10631065967851422629&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "MIT CSAIL; MIT CSAIL; MIT CSAIL; MIT CSAIL", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lang22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT CSAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Coarsening the Granularity: Towards Structurally Sparse Lottery Tickets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16041", "id": "16041", "proceeding": "https://proceedings.mlr.press/v162/chen22a.html", "poster": "/media/PosterPDFs/ICML%202022/22f7e834551fbb0f6ea55b04889e8eb1.png?t=1657516256.1040432", "slides": "", "author_site": "Tianlong Chen, Xuxi Chen, Xiaolong Ma, Yanzhi Wang, Zhangyang \u201cAtlas\u201d Wang", "author": "Tianlong Chen; Xuxi Chen; Xiaolong Ma; Yanzhi Wang; Zhangyang Wang", "abstract": "The lottery ticket hypothesis (LTH) has shown that dense models contain highly sparse subnetworks (i.e., winning tickets) that can be trained in isolation to match full accuracy. Despite many exciting efforts being made, there is one \"commonsense\" rarely challenged: a winning ticket is found by iterative magnitude pruning (IMP) and hence the resultant pruned subnetworks have only unstructured sparsity. That gap limits the appeal of winning tickets in practice, since the highly irregular sparse patterns are challenging to accelerate on hardware. Meanwhile, directly substituting structured pruning for unstructured pruning in IMP damages performance more severely and is usually unable to locate winning tickets. In this paper, we demonstrate the first positive result that a structurally sparse winning ticket can be effectively found in general. The core idea is to append \"post-processing techniques\" after each round of (unstructured) IMP, to enforce the formation of structural sparsity. Specifically, we first \"re-fill\" pruned elements back in some channels deemed to be important, and then \"re-group\" non-zero elements to create flexible group-wise structural patterns. Both our identified channel- and group-wise structural subnetworks win the lottery, with substantial inference speedups readily supported by existing hardware. Extensive experiments, conducted on diverse datasets across multiple network backbones, consistently validate our proposal, showing that the hardware acceleration roadblock of LTH is now removed. Specifically, the structural winning tickets obtain up to {64.93%, 64.84%, 60.23%} running time savings at {36%\u00a080%, 74%, 58%} sparsity on {CIFAR, Tiny-ImageNet, ImageNet}, while maintaining comparable accuracy. 
Code is at https://github.com/VITA-Group/Structure-LTH.", "bibtex": "@InProceedings{pmlr-v162-chen22a,\n title = \t {Coarsening the Granularity: Towards Structurally Sparse Lottery Tickets},\n author = {Chen, Tianlong and Chen, Xuxi and Ma, Xiaolong and Wang, Yanzhi and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3025--3039},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22a/chen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22a.html},\n abstract = \t {The lottery ticket hypothesis (LTH) has shown that dense models contain highly sparse subnetworks (i.e., winning tickets) that can be trained in isolation to match full accuracy. Despite many exciting efforts being made, there is one \"commonsense\" rarely challenged: a winning ticket is found by iterative magnitude pruning (IMP) and hence the resultant pruned subnetworks have only unstructured sparsity. That gap limits the appeal of winning tickets in practice, since the highly irregular sparse patterns are challenging to accelerate on hardware. Meanwhile, directly substituting structured pruning for unstructured pruning in IMP damages performance more severely and is usually unable to locate winning tickets. In this paper, we demonstrate the first positive result that a structurally sparse winning ticket can be effectively found in general. The core idea is to append \"post-processing techniques\" after each round of (unstructured) IMP, to enforce the formation of structural sparsity. Specifically, we first \"re-fill\" pruned elements back in some channels deemed to be important, and then \"re-group\" non-zero elements to create flexible group-wise structural patterns. Both our identified channel- and group-wise structural subnetworks win the lottery, with substantial inference speedups readily supported by existing hardware. Extensive experiments, conducted on diverse datasets across multiple network backbones, consistently validate our proposal, showing that the hardware acceleration roadblock of LTH is now removed. Specifically, the structural winning tickets obtain up to {64.93%, 64.84%, 60.23%} running time savings at {36%\u00a080%, 74%, 58%} sparsity on {CIFAR, Tiny-ImageNet, ImageNet}, while maintaining comparable accuracy. 
Code is at https://github.com/VITA-Group/Structure-LTH.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22a/chen22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22a-supp.zip", "pdf_size": 1808619, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11130219439194607083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Texas at Austin; University of Texas at Austin; Northeastern University; Northeastern University; University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ;utexas.edu", "email": "utexas.edu; ; ; ;utexas.edu", "github": "https://github.com/VITA-Group/Structure-LTH", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "University of Texas at Austin;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.northeastern.edu", "aff_unique_abbr": "UT Austin;NEU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Coin Flipping Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18201", "id": "18201", "proceeding": "https://proceedings.mlr.press/v162/sieradzki22a.html", "poster": "/media/PosterPDFs/ICML%202022/d51b416788b6ee70eb0c381c06efc9f1.png?t=1657486265.5906732", "slides": "", "author_site": "Yuval Sieradzki, Nitzan Hodos, Gal Yehuda, Assaf Schuster", "author": "Yuval Sieradzki; Nitzan Hodos; Gal Yehuda; Assaf Schuster", "abstract": "We show that neural networks with access to randomness can outperform deterministic networks by using amplification. We call such networks Coin-Flipping Neural Networks, or CFNNs. We show that a CFNN can approximate the indicator of a d-dimensional ball to arbitrary accuracy with only 2 layers and O(1) neurons, where a 2-layer deterministic network was shown to require Omega(e^d) neurons, an exponential improvement. We prove a highly non-trivial result, that for almost any classification problem, there exists a trivially simple network that solves it given a sufficiently powerful generator for the network\u2019s weights. Combining these results we conjecture that for most classification problems, there is a CFNN which solves them with higher accuracy or fewer neurons than any deterministic network. Finally, we verify our proofs experimentally using novel CFNN architectures on CIFAR10 and CIFAR100, reaching an improvement of 9.25% from the baseline.", "bibtex": "@InProceedings{pmlr-v162-sieradzki22a,\n title = \t {Coin Flipping Neural Networks},\n author = {Sieradzki, Yuval and Hodos, Nitzan and Yehuda, Gal and Schuster, Assaf},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20195--20214},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sieradzki22a/sieradzki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sieradzki22a.html},\n abstract = \t {We show that neural networks with access to randomness can outperform deterministic networks by using amplification. We call such networks Coin-Flipping Neural Networks, or CFNNs. 
We show that a CFNN can approximate the indicator of a d-dimensional ball to arbitrary accuracy with only 2 layers and O(1) neurons, where a 2-layer deterministic network was shown to require Omega(e^d) neurons, an exponential improvement. We prove a highly non-trivial result, that for almost any classification problem, there exists a trivially simple network that solves it given a sufficiently powerful generator for the network\u2019s weights. Combining these results we conjecture that for most classification problems, there is a CFNN which solves them with higher accuracy or fewer neurons than any deterministic network. Finally, we verify our proofs experimentally using novel CFNN architectures on CIFAR10 and CIFAR100, reaching an improvement of 9.25% from the baseline.}\n}", "pdf": "https://proceedings.mlr.press/v162/sieradzki22a/sieradzki22a.pdf", "supp": "", "pdf_size": 2673661, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11169926247282494743&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel; Department of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel; Department of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel; Department of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel", "aff_domain": "campus.technion.ac.il;campus.technion.ac.il; ;technion.ac.il", "email": "campus.technion.ac.il;campus.technion.ac.il; ;technion.ac.il", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/sieradzki22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Collaboration of Experts: Achieving 80% Top-1 Accuracy on ImageNet with 100M FLOPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17451", "id": "17451", "proceeding": "https://proceedings.mlr.press/v162/zhang22c.html", "poster": "/media/PosterPDFs/ICML%202022/74934548253bcab8490ebd74afed7031.png?t=1657279833.3058636", "slides": "", "author_site": "Yikang Zhang, zhuo chen, Zhao Zhong", "author": "Yikang Zhang; Zhuo Chen; Zhao Zhong", "abstract": "In this paper, we propose a Collaboration of Experts (CoE) framework to assemble the expertise of multiple networks towards a common goal. Each expert is an individual network with expertise on a unique portion of the dataset, contributing to the collective capacity. Given a sample, delegator selects an expert and simultaneously outputs a rough prediction to trigger potential early termination. For each model in CoE, we propose a novel training algorithm with two major components: weight generation module (WGM) and label generation module (LGM). It fulfills the co-adaptation of experts and delegator. WGM partitions the training data into portions based on delegator via solving a balanced transportation problem, then impels each expert to focus on one portion by reweighting the losses. LGM generates the label to constitute the loss of delegator for expert selection. CoE achieves the state-of-the-art performance on ImageNet, 80.7% top-1 accuracy with 194M FLOPs. 
Combined with PWLU and CondConv, CoE further boosts the accuracy to 80.0% with only 100M FLOPs for the first time. Furthermore, experiment results on the translation task also demonstrate the strong generalizability of CoE. CoE is hardware-friendly, yielding a 3\u00a06x acceleration compared with existing conditional computation approaches.", "bibtex": "@InProceedings{pmlr-v162-zhang22c,\n title = \t {Collaboration of Experts: Achieving 80% Top-1 Accuracy on {I}mage{N}et with 100{M} {FLOP}s},\n author = {Zhang, Yikang and Chen, Zhuo and Zhong, Zhao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26068--26084},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22c/zhang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22c.html},\n abstract = \t {In this paper, we propose a Collaboration of Experts (CoE) framework to assemble the expertise of multiple networks towards a common goal. Each expert is an individual network with expertise on a unique portion of the dataset, contributing to the collective capacity. Given a sample, delegator selects an expert and simultaneously outputs a rough prediction to trigger potential early termination. For each model in CoE, we propose a novel training algorithm with two major components: weight generation module (WGM) and label generation module (LGM). It fulfills the co-adaptation of experts and delegator. WGM partitions the training data into portions based on delegator via solving a balanced transportation problem, then impels each expert to focus on one portion by reweighting the losses. LGM generates the label to constitute the loss of delegator for expert selection. CoE achieves the state-of-the-art performance on ImageNet, 80.7% top-1 accuracy with 194M FLOPs. Combined with PWLU and CondConv, CoE further boosts the accuracy to 80.0% with only 100M FLOPs for the first time. Furthermore, experiment results on the translation task also demonstrate the strong generalizability of CoE. 
CoE is hardware-friendly, yielding a 3\u00a06x acceleration compared with existing conditional computation approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22c/zhang22c.pdf", "supp": "", "pdf_size": 8885793, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3356202487479523066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Huawei, Beijing, China; Huawei, Beijing, China; Huawei, Beijing, China", "aff_domain": "huawei.com; ; ", "email": "huawei.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Huawei", "aff_unique_dep": "Huawei", "aff_unique_url": "https://www.huawei.com", "aff_unique_abbr": "Huawei", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Combining Diverse Feature Priors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16523", "id": "16523", "proceeding": "https://proceedings.mlr.press/v162/jain22b.html", "poster": "/media/PosterPDFs/ICML%202022/0234c510bc6d908b28c70ff313743079.png?t=1657635139.1467679", "slides": "", "author_site": "Saachi Jain, Dimitris Tsipras, Aleksander Madry", "author": "Saachi Jain; Dimitris Tsipras; Aleksander Madry", "abstract": "To improve model generalization, model designers often restrict the features that their models use, either implicitly or explicitly. In this work, we explore the design space of leveraging such feature priors by viewing them as distinct perspectives on the data. Specifically, we find that models trained with diverse sets of explicit feature priors have less overlapping failure modes, and can thus be combined more effectively. Moreover, we demonstrate that jointly training such models on additional (unlabeled) data allows them to correct each other\u2019s mistakes, which, in turn, leads to better generalization and resilience to spurious correlations.", "bibtex": "@InProceedings{pmlr-v162-jain22b,\n title = \t {Combining Diverse Feature Priors},\n author = {Jain, Saachi and Tsipras, Dimitris and Madry, Aleksander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9802--9832},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jain22b/jain22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/jain22b.html},\n abstract = \t {To improve model generalization, model designers often restrict the features that their models use, either implicitly or explicitly. In this work, we explore the design space of leveraging such feature priors by viewing them as distinct perspectives on the data. Specifically, we find that models trained with diverse sets of explicit feature priors have less overlapping failure modes, and can thus be combined more effectively. 
Moreover, we demonstrate that jointly training such models on additional (unlabeled) data allows them to correct each other\u2019s mistakes, which, in turn, leads to better generalization and resilience to spurious correlations.}\n}", "pdf": "https://proceedings.mlr.press/v162/jain22b/jain22b.pdf", "supp": "", "pdf_size": 3733726, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3431368394631636693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "MIT; MIT; MIT", "aff_domain": "mit.edu;mit.edu; ", "email": "mit.edu;mit.edu; ", "github": "https://github.com/MadryLab/copriors", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jain22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Communicating via Markov Decision Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16975", "id": "16975", "proceeding": "https://proceedings.mlr.press/v162/sokota22a.html", "poster": "/media/PosterPDFs/ICML%202022/748ba69d3e8d1af87f84fee909eef339.png?t=1657573187.1597419", "slides": "", "author_site": "Samuel Sokota, Christian Schroeder, Maximilian Igl, Luisa Zintgraf, Phil Torr, Martin Strohmeier, Zico Kolter, Shimon Whiteson, Jakob Foerster", "author": "Samuel Sokota; Christian A Schroeder De Witt; Maximilian Igl; Luisa M Zintgraf; Philip Torr; Martin Strohmeier; Zico Kolter; Shimon Whiteson; Jakob Foerster", "abstract": "We consider the problem of communicating exogenous information by means of Markov decision process trajectories. This setting, which we call a Markov coding game (MCG), generalizes both source coding and a large class of referential games. MCGs also isolate a problem that is important in decentralized control settings in which cheap-talk is not available\u2014namely, they require balancing communication with the associated cost of communicating. We contribute a theoretically grounded approach to MCGs based on maximum entropy reinforcement learning and minimum entropy coupling that we call MEME. Due to recent breakthroughs in approximation algorithms for minimum entropy coupling, MEME is not merely a theoretical algorithm, but can be applied to practical settings. Empirically, we show both that MEME is able to outperform a strong baseline on small MCGs and that MEME is able to achieve strong performance on extremely large MCGs. 
To the latter point, we demonstrate that MEME is able to losslessly communicate binary images via trajectories of Cartpole and Pong, while simultaneously achieving the maximal or near maximal expected returns, and that it is even capable of performing well in the presence of actuator noise.", "bibtex": "@InProceedings{pmlr-v162-sokota22a,\n title = \t {Communicating via {M}arkov Decision Processes},\n author = {Sokota, Samuel and De Witt, Christian A Schroeder and Igl, Maximilian and Zintgraf, Luisa M and Torr, Philip and Strohmeier, Martin and Kolter, Zico and Whiteson, Shimon and Foerster, Jakob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20314--20328},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sokota22a/sokota22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sokota22a.html},\n abstract = \t {We consider the problem of communicating exogenous information by means of Markov decision process trajectories. This setting, which we call a Markov coding game (MCG), generalizes both source coding and a large class of referential games. MCGs also isolate a problem that is important in decentralized control settings in which cheap-talk is not available\u2014namely, they require balancing communication with the associated cost of communicating. We contribute a theoretically grounded approach to MCGs based on maximum entropy reinforcement learning and minimum entropy coupling that we call MEME. Due to recent breakthroughs in approximation algorithms for minimum entropy coupling, MEME is not merely a theoretical algorithm, but can be applied to practical settings. Empirically, we show both that MEME is able to outperform a strong baseline on small MCGs and that MEME is able to achieve strong performance on extremely large MCGs. To the latter point, we demonstrate that MEME is able to losslessly communicate binary images via trajectories of Cartpole and Pong, while simultaneously achieving the maximal or near maximal expected returns, and that it is even capable of performing well in the presence of actuator noise.}\n}", "pdf": "https://proceedings.mlr.press/v162/sokota22a/sokota22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/sokota22a-supp.zip", "pdf_size": 1609679, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1909863582927997201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/sokota22a.html" }, { "title": "Communication-Efficient Adaptive Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18273", "id": "18273", "proceeding": "https://proceedings.mlr.press/v162/wang22o.html", "poster": "/media/PosterPDFs/ICML%202022/83715fd4755b33f9c3958e1a9ee221e1_TDlwisD.png?t=1657593241.2853951", "slides": "", "author_site": "Yujia Wang, Lu Lin, Jinghui Chen", "author": "Yujia Wang; Lu Lin; Jinghui Chen", "abstract": "Federated learning is a machine learning training paradigm that enables clients to jointly train models without sharing their own localized data. 
However, the implementation of federated learning in practice still faces numerous challenges, such as the large communication overhead due to the repetitive server-client synchronization and the lack of adaptivity by SGD-based model updates. Despite that various methods have been proposed for reducing the communication cost by gradient compression or quantization, and the federated versions of adaptive optimizers such as FedAdam are proposed to add more adaptivity, the current federated learning framework still cannot solve the aforementioned challenges all at once. In this paper, we propose a novel communication-efficient adaptive federated learning method (FedCAMS) with theoretical convergence guarantees. We show that in the nonconvex stochastic optimization setting, our proposed FedCAMS achieves the same convergence rate of $O(\\frac{1}{\\sqrt{TKm}})$ as its non-compressed counterparts. Extensive experiments on various benchmarks verify our theoretical analysis.", "bibtex": "@InProceedings{pmlr-v162-wang22o,\n title = \t {Communication-Efficient Adaptive Federated Learning},\n author = {Wang, Yujia and Lin, Lu and Chen, Jinghui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22802--22838},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22o/wang22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22o.html},\n abstract = \t {Federated learning is a machine learning training paradigm that enables clients to jointly train models without sharing their own localized data. However, the implementation of federated learning in practice still faces numerous challenges, such as the large communication overhead due to the repetitive server-client synchronization and the lack of adaptivity by SGD-based model updates. Despite that various methods have been proposed for reducing the communication cost by gradient compression or quantization, and the federated versions of adaptive optimizers such as FedAdam are proposed to add more adaptivity, the current federated learning framework still cannot solve the aforementioned challenges all at once. In this paper, we propose a novel communication-efficient adaptive federated learning method (FedCAMS) with theoretical convergence guarantees. We show that in the nonconvex stochastic optimization setting, our proposed FedCAMS achieves the same convergence rate of $O(\\frac{1}{\\sqrt{TKm}})$ as its non-compressed counterparts. 
Extensive experiments on various benchmarks verify our theoretical analysis.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22o/wang22o.pdf", "supp": "", "pdf_size": 1184286, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5602954900920350565&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "College of Information Sciences and Technology, Pennsylvania State University, State College, PA, United States; Department of Computer Science, University of Virginia, Charlottesville, VA, United States; College of Information Sciences and Technology, Pennsylvania State University, State College, PA, United States", "aff_domain": "psu.edu; ;psu.edu", "email": "psu.edu; ;psu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22o.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Pennsylvania State University;University of Virginia", "aff_unique_dep": "College of Information Sciences and Technology;Department of Computer Science", "aff_unique_url": "https://www.psu.edu;https://www.virginia.edu", "aff_unique_abbr": "PSU;UVA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "State College;Charlottesville", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Communication-efficient Distributed Learning for Large Batch Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17001", "id": "17001", "proceeding": "https://proceedings.mlr.press/v162/liu22n.html", "poster": "", "slides": "", "author_site": "Rui Liu, Barzan Mozafari", "author": "Rui Liu; Barzan Mozafari", "abstract": "Many communication-efficient methods have been proposed for distributed learning, whereby gradient compression is used to reduce the communication cost. However, given recent advances in large batch optimization (e.g., large batch SGD and its variant LARS with layerwise adaptive learning rates), the compute power of each machine is being fully utilized. This means, in modern distributed learning, the per-machine computation cost is no longer negligible compared to the communication cost. In this paper, we propose new gradient compression methods for large batch optimization, JointSpar and its variant JointSpar-LARS with layerwise adaptive learning rates, that jointly reduce both the computation and the communication cost. To achieve this, we take advantage of the redundancy in the gradient computation, unlike the existing methods compute all coordinates of the gradient vector, even if some coordinates are later dropped for communication efficiency. JointSpar and its variant further reduce the training time by avoiding the wasted computation on dropped coordinates. While computationally more efficient, we prove that JointSpar and its variant also maintain the same convergence rates as their respective baseline methods. 
Extensive experiments show that, by reducing the time per iteration, our methods converge faster than state-of-the-art compression methods in terms of wall-clock time.", "bibtex": "@InProceedings{pmlr-v162-liu22n,\n title = \t {Communication-efficient Distributed Learning for Large Batch Optimization},\n author = {Liu, Rui and Mozafari, Barzan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13925--13946},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22n/liu22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22n.html},\n abstract = \t {Many communication-efficient methods have been proposed for distributed learning, whereby gradient compression is used to reduce the communication cost. However, given recent advances in large batch optimization (e.g., large batch SGD and its variant LARS with layerwise adaptive learning rates), the compute power of each machine is being fully utilized. This means, in modern distributed learning, the per-machine computation cost is no longer negligible compared to the communication cost. In this paper, we propose new gradient compression methods for large batch optimization, JointSpar and its variant JointSpar-LARS with layerwise adaptive learning rates, that jointly reduce both the computation and the communication cost. To achieve this, we take advantage of the redundancy in the gradient computation, unlike the existing methods compute all coordinates of the gradient vector, even if some coordinates are later dropped for communication efficiency. JointSpar and its variant further reduce the training time by avoiding the wasted computation on dropped coordinates. While computationally more efficient, we prove that JointSpar and its variant also maintain the same convergence rates as their respective baseline methods. 
Extensive experiments show that, by reducing the time per iteration, our methods converge faster than state-of-the-art compression methods in terms of wall-clock time.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22n/liu22n.pdf", "supp": "", "pdf_size": 685455, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5434835206629534205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Computer Science and Engineering, University of Michigan, Ann Arbor; Computer Science and Engineering, University of Michigan, Ann Arbor", "aff_domain": "umich.edu;umich.edu", "email": "umich.edu;umich.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/liu22n.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "Computer Science and Engineering", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Composing Partial Differential Equations with Physics-Aware Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16235", "id": "16235", "proceeding": "https://proceedings.mlr.press/v162/karlbauer22a.html", "poster": "/media/PosterPDFs/ICML%202022/1b113258af3968aaf3969ca67e744ff8.png?t=1657214578.397174", "slides": "", "author_site": "Matthias Karlbauer, Timothy Praditia, Sebastian Otte, Sergey Oladyshkin, Wolfgang Nowak, Martin V Butz", "author": "Matthias Karlbauer; Timothy Praditia; Sebastian Otte; Sergey Oladyshkin; Wolfgang Nowak; Martin V. Butz", "abstract": "We introduce a compositional physics-aware FInite volume Neural Network (FINN) for learning spatiotemporal advection-diffusion processes. FINN implements a new way of combining the learning abilities of artificial neural networks with physical and structural knowledge from numerical simulation by modeling the constituents of partial differential equations (PDEs) in a compositional manner. Results on both one- and two-dimensional PDEs (Burgers\u2019, diffusion-sorption, diffusion-reaction, Allen{\u2013}Cahn) demonstrate FINN\u2019s superior modeling accuracy and excellent out-of-distribution generalization ability beyond initial and boundary conditions. With only one tenth of the number of parameters on average, FINN outperforms pure machine learning and other state-of-the-art physics-aware models in all cases{\u2014}often even by multiple orders of magnitude. 
Moreover, FINN outperforms a calibrated physical model when approximating sparse real-world data in a diffusion-sorption scenario, confirming its generalization abilities and showing explanatory potential by revealing the unknown retardation factor of the observed process.", "bibtex": "@InProceedings{pmlr-v162-karlbauer22a,\n title = \t {Composing Partial Differential Equations with Physics-Aware Neural Networks},\n author = {Karlbauer, Matthias and Praditia, Timothy and Otte, Sebastian and Oladyshkin, Sergey and Nowak, Wolfgang and Butz, Martin V.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10773--10801},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/karlbauer22a/karlbauer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/karlbauer22a.html},\n abstract = \t {We introduce a compositional physics-aware FInite volume Neural Network (FINN) for learning spatiotemporal advection-diffusion processes. FINN implements a new way of combining the learning abilities of artificial neural networks with physical and structural knowledge from numerical simulation by modeling the constituents of partial differential equations (PDEs) in a compositional manner. Results on both one- and two-dimensional PDEs (Burgers\u2019, diffusion-sorption, diffusion-reaction, Allen{\u2013}Cahn) demonstrate FINN\u2019s superior modeling accuracy and excellent out-of-distribution generalization ability beyond initial and boundary conditions. With only one tenth of the number of parameters on average, FINN outperforms pure machine learning and other state-of-the-art physics-aware models in all cases{\u2014}often even by multiple orders of magnitude. 
Moreover, FINN outperforms a calibrated physical model when approximating sparse real-world data in a diffusion-sorption scenario, confirming its generalization abilities and showing explanatory potential by revealing the unknown retardation factor of the observed process.}\n}", "pdf": "https://proceedings.mlr.press/v162/karlbauer22a/karlbauer22a.pdf", "supp": "", "pdf_size": 3040774, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5219761110162787549&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Neuro-Cognitive Modeling, University of T\u00fcbingen, T\u00fcbingen, Germany+Department of Stochastic Simulation and Safety Research for Hydrosystems, University of Stuttgart, Stuttgart, Germany; Department of Stochastic Simulation and Safety Research for Hydrosystems, University of Stuttgart, Stuttgart, Germany; Neuro-Cognitive Modeling, University of T\u00fcbingen, T\u00fcbingen, Germany; Department of Stochastic Simulation and Safety Research for Hydrosystems, University of Stuttgart, Stuttgart, Germany; Department of Stochastic Simulation and Safety Research for Hydrosystems, University of Stuttgart, Stuttgart, Germany; Neuro-Cognitive Modeling, University of T\u00fcbingen, T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de; ; ; ; ; ", "email": "uni-tuebingen.de; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/karlbauer22a.html", "aff_unique_index": "0+1;1;0;1;1;0", "aff_unique_norm": "University of T\u00fcbingen;University of Stuttgart", "aff_unique_dep": "Neuro-Cognitive Modeling;Department of Stochastic Simulation and Safety Research for Hydrosystems", "aff_unique_url": "https://www.uni-tuebingen.de;https://www.uni-stuttgart.de", "aff_unique_abbr": ";Uni Stuttgart", "aff_campus_unique_index": "0+1;1;0;1;1;0", "aff_campus_unique": "T\u00fcbingen;Stuttgart", "aff_country_unique_index": "0+0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Comprehensive Analysis of Negative Sampling in Knowledge Graph Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17513", "id": "17513", "proceeding": "https://proceedings.mlr.press/v162/kamigaito22a.html", "poster": "/media/PosterPDFs/ICML%202022/adf854f418fc96fb01ad92a2ed2fc35c.png?t=1658389575.061868", "slides": "", "author_site": "Hidetaka Kamigaito, Katsuhiko Hayashi", "author": "Hidetaka Kamigaito; Katsuhiko Hayashi", "abstract": "Negative sampling\u00a0(NS) loss plays an important role in learning knowledge graph embedding\u00a0(KGE) to handle a huge number of entities. However, the performance of KGE degrades without hyperparameters such as the margin term and number of negative samples in NS loss being appropriately selected. Currently, empirical hyperparameter tuning addresses this problem at the cost of computational time. To solve this problem, we theoretically analyzed NS loss to assist hyperparameter tuning and understand the better use of the NS loss in KGE learning. Our theoretical analysis showed that scoring methods with restricted value ranges, such as TransE and RotatE, require appropriate adjustment of the margin term or the number of negative samples different from those without restricted value ranges, such as RESCAL, ComplEx, and DistMult. We also propose subsampling methods specialized for the NS loss in KGE studied from a theoretical aspect. 
Our empirical analysis on the FB15k-237, WN18RR, and YAGO3-10 datasets showed that the results of actually trained models agree with our theoretical findings.", "bibtex": "@InProceedings{pmlr-v162-kamigaito22a,\n title = \t {Comprehensive Analysis of Negative Sampling in Knowledge Graph Representation Learning},\n author = {Kamigaito, Hidetaka and Hayashi, Katsuhiko},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10661--10675},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kamigaito22a/kamigaito22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kamigaito22a.html},\n abstract = \t {Negative sampling\u00a0(NS) loss plays an important role in learning knowledge graph embedding\u00a0(KGE) to handle a huge number of entities. However, the performance of KGE degrades without hyperparameters such as the margin term and number of negative samples in NS loss being appropriately selected. Currently, empirical hyperparameter tuning addresses this problem at the cost of computational time. To solve this problem, we theoretically analyzed NS loss to assist hyperparameter tuning and understand the better use of the NS loss in KGE learning. Our theoretical analysis showed that scoring methods with restricted value ranges, such as TransE and RotatE, require appropriate adjustment of the margin term or the number of negative samples different from those without restricted value ranges, such as RESCAL, ComplEx, and DistMult. We also propose subsampling methods specialized for the NS loss in KGE studied from a theoretical aspect. 
Our empirical analysis on the FB15k-237, WN18RR, and YAGO3-10 datasets showed that the results of actually trained models agree with our theoretical findings.}\n}", "pdf": "https://proceedings.mlr.press/v162/kamigaito22a/kamigaito22a.pdf", "supp": "", "pdf_size": 549631, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4661195844634999621&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Nara Institute of Science and Technology (NAIST), Nara, Japan; Hokkaido University, Hokkaido, Japan", "aff_domain": "is.naist.jp; ", "email": "is.naist.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kamigaito22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Nara Institute of Science and Technology;Hokkaido University", "aff_unique_dep": ";", "aff_unique_url": "https://www.naist.jp;https://www.hokudai.ac.jp", "aff_unique_abbr": "NAIST;Hokkaido U.", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Nara;Hokkaido", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Compressed-VFL: Communication-Efficient Learning with Vertically Partitioned Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16395", "id": "16395", "proceeding": "https://proceedings.mlr.press/v162/castiglia22a.html", "poster": "/media/PosterPDFs/ICML%202022/5726daf2c9ee0f955eca58291c26d2f3_q3o0MBJ.png?t=1657216429.6136196", "slides": "/media/icml-2022/Slides/16395_EdvELVq.pdf", "author_site": "Timothy Castiglia, Anirban Das, Shiqiang Wang, Stacy Patterson", "author": "Timothy J Castiglia; Anirban Das; Shiqiang Wang; Stacy Patterson", "abstract": "We propose Compressed Vertical Federated Learning (C-VFL) for communication-efficient training on vertically partitioned data. In C-VFL, a server and multiple parties collaboratively train a model on their respective features utilizing several local iterations and sharing compressed intermediate results periodically. Our work provides the first theoretical analysis of the effect message compression has on distributed training over vertically partitioned data. We prove convergence of non-convex objectives at a rate of $O(\\frac{1}{\\sqrt{T}})$ when the compression error is bounded over the course of training. We provide specific requirements for convergence with common compression techniques, such as quantization and top-$k$ sparsification. Finally, we experimentally show compression can reduce communication by over $90%$ without a significant decrease in accuracy over VFL without compression.", "bibtex": "@InProceedings{pmlr-v162-castiglia22a,\n title = \t {Compressed-{VFL}: Communication-Efficient Learning with Vertically Partitioned Data},\n author = {Castiglia, Timothy J and Das, Anirban and Wang, Shiqiang and Patterson, Stacy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2738--2766},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/castiglia22a/castiglia22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/castiglia22a.html},\n abstract = \t {We propose Compressed Vertical Federated Learning (C-VFL) for communication-efficient training on vertically partitioned data. 
In C-VFL, a server and multiple parties collaboratively train a model on their respective features utilizing several local iterations and sharing compressed intermediate results periodically. Our work provides the first theoretical analysis of the effect message compression has on distributed training over vertically partitioned data. We prove convergence of non-convex objectives at a rate of $O(\\frac{1}{\\sqrt{T}})$ when the compression error is bounded over the course of training. We provide specific requirements for convergence with common compression techniques, such as quantization and top-$k$ sparsification. Finally, we experimentally show compression can reduce communication by over $90%$ without a significant decrease in accuracy over VFL without compression.}\n}", "pdf": "https://proceedings.mlr.press/v162/castiglia22a/castiglia22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/castiglia22a-supp.zip", "pdf_size": 3118657, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7143121005409493886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Rensselaer Polytechnic Institute, Troy, NY, USA; Department of Computer Science, Rensselaer Polytechnic Institute, Troy, NY, USA; IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA; Department of Computer Science, Rensselaer Polytechnic Institute, Troy, NY, USA", "aff_domain": "rpi.edu; ; ; ", "email": "rpi.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/castiglia22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Rensselaer Polytechnic Institute;IBM", "aff_unique_dep": "Department of Computer Science;IBM Thomas J. Watson Research Center", "aff_unique_url": "https://www.rpi.edu;https://www.ibm.com/research/watson", "aff_unique_abbr": "RPI;IBM Watson", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Troy;Yorktown Heights", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Conditional GANs with Auxiliary Discriminative Classifier", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17139", "id": "17139", "proceeding": "https://proceedings.mlr.press/v162/hou22a.html", "poster": "/media/PosterPDFs/ICML%202022/24896ee4c6526356cc127852413ea3b4_ajreJYt.png?t=1657376614.9827688", "slides": "", "author_site": "Liang Hou, Qi Cao, Huawei Shen, Siyuan Pan, Xiaoshuang Li, Xueqi Cheng", "author": "Liang Hou; Qi Cao; Huawei Shen; Siyuan Pan; Xiaoshuang Li; Xueqi Cheng", "abstract": "Conditional generative models aim to learn the underlying joint distribution of data and labels to achieve conditional data generation. Among them, the auxiliary classifier generative adversarial network (AC-GAN) has been widely used, but suffers from the problem of low intra-class diversity of the generated samples. The fundamental reason pointed out in this paper is that the classifier of AC-GAN is generator-agnostic, which therefore cannot provide informative guidance for the generator to approach the joint distribution, resulting in a minimization of the conditional entropy that decreases the intra-class diversity. Motivated by this understanding, we propose a novel conditional GAN with an auxiliary discriminative classifier (ADC-GAN) to resolve the above problem. 
Specifically, the proposed auxiliary discriminative classifier becomes generator-aware by recognizing the class-labels of the real data and the generated data discriminatively. Our theoretical analysis reveals that the generator can faithfully learn the joint distribution even without the original discriminator, making the proposed ADC-GAN robust to the value of the coefficient hyperparameter and the selection of the GAN loss, and stable during training. Extensive experimental results on synthetic and real-world datasets demonstrate the superiority of ADC-GAN in conditional generative modeling compared to state-of-the-art classifier-based and projection-based conditional GANs.", "bibtex": "@InProceedings{pmlr-v162-hou22a,\n title = \t {Conditional {GAN}s with Auxiliary Discriminative Classifier},\n author = {Hou, Liang and Cao, Qi and Shen, Huawei and Pan, Siyuan and Li, Xiaoshuang and Cheng, Xueqi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8888--8902},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hou22a/hou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hou22a.html},\n abstract = \t {Conditional generative models aim to learn the underlying joint distribution of data and labels to achieve conditional data generation. Among them, the auxiliary classifier generative adversarial network (AC-GAN) has been widely used, but suffers from the problem of low intra-class diversity of the generated samples. The fundamental reason pointed out in this paper is that the classifier of AC-GAN is generator-agnostic, which therefore cannot provide informative guidance for the generator to approach the joint distribution, resulting in a minimization of the conditional entropy that decreases the intra-class diversity. Motivated by this understanding, we propose a novel conditional GAN with an auxiliary discriminative classifier (ADC-GAN) to resolve the above problem. Specifically, the proposed auxiliary discriminative classifier becomes generator-aware by recognizing the class-labels of the real data and the generated data discriminatively. Our theoretical analysis reveals that the generator can faithfully learn the joint distribution even without the original discriminator, making the proposed ADC-GAN robust to the value of the coefficient hyperparameter and the selection of the GAN loss, and stable during training. 
Extensive experimental results on synthetic and real-world datasets demonstrate the superiority of ADC-GAN in conditional generative modeling compared to state-of-the-art classifier-based and projection-based conditional GANs.}\n}", "pdf": "https://proceedings.mlr.press/v162/hou22a/hou22a.pdf", "supp": "", "pdf_size": 954806, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=868024013198158367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Data Intelligence System Research Center, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; Data Intelligence System Research Center, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; Data Intelligence System Research Center, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China; Shanghai Jiao Tong University, Shanghai, China; Shanghai Jiao Tong University, Shanghai, China; CAS Key Laboratory of Network Data Science and Technology, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China + University of Chinese Academy of Sciences, Beijing, China", "aff_domain": "ict.ac.cn;ict.ac.cn;ict.ac.cn;sjtu.edu.cn;sjtu.edu.cn;ict.ac.cn", "email": "ict.ac.cn;ict.ac.cn;ict.ac.cn;sjtu.edu.cn;sjtu.edu.cn;ict.ac.cn", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/hou22a.html", "aff_unique_index": "0+1;0+1;0+1;2;2;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Shanghai Jiao Tong University", "aff_unique_dep": "Institute of Computing Technology;;", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "CAS;UCAS;SJTU", "aff_campus_unique_index": "0+0;0+0;0+0;1;1;0+0", "aff_campus_unique": "Beijing;Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0+0", "aff_country_unique": "China" }, { "title": "Confidence Score for Source-Free Unsupervised Domain Adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17275", "id": "17275", "proceeding": "https://proceedings.mlr.press/v162/lee22c.html", "poster": "/media/PosterPDFs/ICML%202022/59b90e1005a220e2ebc542eb9d950b1e_wQFyv0r.png?t=1657607433.663796", "slides": "", "author_site": "Jonghyun Lee, Dahuin Jung, Junho Yim, Sungroh Yoon", "author": "Jonghyun Lee; Dahuin Jung; Junho Yim; Sungroh Yoon", "abstract": "Source-free unsupervised domain adaptation (SFUDA) aims to obtain high performance in the unlabeled target domain using the pre-trained source model, not the source data. Existing SFUDA methods assign the same importance to all target samples, which is vulnerable to incorrect pseudo-labels. To differentiate between sample importance, in this study, we propose a novel sample-wise confidence score, the Joint Model-Data Structure (JMDS) score for SFUDA. Unlike existing confidence scores that use only one of the source or target domain knowledge, the JMDS score uses both knowledge. We then propose a Confidence score Weighting Adaptation using the JMDS (CoWA-JMDS) framework for SFUDA. CoWA-JMDS consists of the JMDS scores as sample weights and weight Mixup that is our proposed variant of Mixup. Weight Mixup promotes the model make more use of the target domain knowledge. 
The experimental results show that the JMDS score outperforms the existing confidence scores. Moreover, CoWA-JMDS achieves state-of-the-art performance on various SFUDA scenarios: closed, open, and partial-set scenarios.", "bibtex": "@InProceedings{pmlr-v162-lee22c,\n title = \t {Confidence Score for Source-Free Unsupervised Domain Adaptation},\n author = {Lee, Jonghyun and Jung, Dahuin and Yim, Junho and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12365--12377},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22c/lee22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22c.html},\n abstract = \t {Source-free unsupervised domain adaptation (SFUDA) aims to obtain high performance in the unlabeled target domain using the pre-trained source model, not the source data. Existing SFUDA methods assign the same importance to all target samples, which is vulnerable to incorrect pseudo-labels. To differentiate between sample importance, in this study, we propose a novel sample-wise confidence score, the Joint Model-Data Structure (JMDS) score for SFUDA. Unlike existing confidence scores that use only one of the source or target domain knowledge, the JMDS score uses both knowledge. We then propose a Confidence score Weighting Adaptation using the JMDS (CoWA-JMDS) framework for SFUDA. CoWA-JMDS consists of the JMDS scores as sample weights and weight Mixup that is our proposed variant of Mixup. Weight Mixup promotes the model make more use of the target domain knowledge. The experimental results show that the JMDS score outperforms the existing confidence scores. 
Moreover, CoWA-JMDS achieves state-of-the-art performance on various SFUDA scenarios: closed, open, and partial-set scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22c/lee22c.pdf", "supp": "", "pdf_size": 4417073, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10361966623265648313&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Data Science and AI Lab., Seoul National University; Data Science and AI Lab., Seoul National University; AIRS Company, Hyundai Motor Group, Seoul, Korea; Department of ECE and Interdisciplinary Program in AI, Seoul National University", "aff_domain": "snu.ac.kr; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ;snu.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lee22c.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Seoul National University;Hyundai Motor Group", "aff_unique_dep": "Data Science and AI Lab.;AIRS Company", "aff_unique_url": "https://www.snu.ac.kr;https://www.hyundai.com", "aff_unique_abbr": "SNU;HMG", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Conformal Prediction Sets with Limited False Positives", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17299", "id": "17299", "proceeding": "https://proceedings.mlr.press/v162/fisch22a.html", "poster": "", "slides": "", "author_site": "Adam Fisch, Tal Schuster, Tommi Jaakkola, Regina Barzilay", "author": "Adam Fisch; Tal Schuster; Tommi Jaakkola; Dr.Regina Barzilay", "abstract": "We develop a new approach to multi-label conformal prediction in which we aim to output a precise set of promising prediction candidates with a bounded number of incorrect answers. Standard conformal prediction provides the ability to adapt to model uncertainty by constructing a calibrated candidate set in place of a single prediction, with guarantees that the set contains the correct answer with high probability. In order to obey this coverage property, however, conformal sets can become inundated with noisy candidates\u2014which can render them unhelpful in practice. This is particularly relevant to practical applications where there is a limited budget, and the cost (monetary or otherwise) associated with false positives is non-negligible. We propose to trade coverage for a notion of precision by enforcing that the presence of incorrect candidates in the predicted conformal sets (i.e., the total number of false positives) is bounded according to a user-specified tolerance. Subject to this constraint, our algorithm then optimizes for a generalized notion of set coverage (i.e., the true positive rate) that allows for any number of true answers for a given query (including zero). 
We demonstrate the effectiveness of this approach across a number of classification tasks in natural language processing, computer vision, and computational chemistry.", "bibtex": "@InProceedings{pmlr-v162-fisch22a,\n title = \t {Conformal Prediction Sets with Limited False Positives},\n author = {Fisch, Adam and Schuster, Tal and Jaakkola, Tommi and Barzilay, Dr.Regina},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6514--6532},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fisch22a/fisch22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fisch22a.html},\n abstract = \t {We develop a new approach to multi-label conformal prediction in which we aim to output a precise set of promising prediction candidates with a bounded number of incorrect answers. Standard conformal prediction provides the ability to adapt to model uncertainty by constructing a calibrated candidate set in place of a single prediction, with guarantees that the set contains the correct answer with high probability. In order to obey this coverage property, however, conformal sets can become inundated with noisy candidates\u2014which can render them unhelpful in practice. This is particularly relevant to practical applications where there is a limited budget, and the cost (monetary or otherwise) associated with false positives is non-negligible. We propose to trade coverage for a notion of precision by enforcing that the presence of incorrect candidates in the predicted conformal sets (i.e., the total number of false positives) is bounded according to a user-specified tolerance. Subject to this constraint, our algorithm then optimizes for a generalized notion of set coverage (i.e., the true positive rate) that allows for any number of true answers for a given query (including zero). 
We demonstrate the effectiveness of this approach across a number of classification tasks in natural language processing, computer vision, and computational chemistry.}\n}", "pdf": "https://proceedings.mlr.press/v162/fisch22a/fisch22a.pdf", "supp": "", "pdf_size": 3072336, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3023340906965759657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "CSAIL, Massachusetts Institute of Technology; Google Research; CSAIL, Massachusetts Institute of Technology; CSAIL, Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu; ; ; ", "email": "csail.mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/fisch22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Google Research", "aff_unique_url": "https://www.csail.mit.edu;https://research.google", "aff_unique_abbr": "MIT;Google Research", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Cambridge;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Congested Bandits: Optimal Routing via Short-term Resets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16065", "id": "16065", "proceeding": "https://proceedings.mlr.press/v162/awasthi22a.html", "poster": "", "slides": "", "author_site": "Pranjal Awasthi, Kush Bhatia, Sreenivas Gollapudi, Kostas Kollias", "author": "Pranjal Awasthi; Kush Bhatia; Sreenivas Gollapudi; Kostas Kollias", "abstract": "For traffic routing platforms, the choice of which route to recommend to a user depends on the congestion on these routes \u2013 indeed, an individual\u2019s utility depends on the number of people using the recommended route at that instance. Motivated by this, we introduce the problem of Congested Bandits where each arm\u2019s reward is allowed to depend on the number of times it was played in the past $\\Delta$ timesteps. This dependence on past history of actions leads to a dynamical system where an algorithm\u2019s present choices also affect its future pay-offs, and requires an algorithm to plan for this. We study the congestion aware formulation in the multi-armed bandit (MAB) setup and in the contextual bandit setup with linear rewards. For the multi-armed setup, we propose a UCB style algorithm and show that its policy regret scales as $\\tilde{O}(\\sqrt{K \\Delta T})$. For the linear contextual bandit setup, our algorithm, based on an iterative least squares planner, achieves policy regret $\\tilde{O}(\\sqrt{dT} + \\Delta)$. 
From an experimental standpoint, we corroborate the no-regret properties of our algorithms via a simulation study.", "bibtex": "@InProceedings{pmlr-v162-awasthi22a,\n title = \t {Congested Bandits: Optimal Routing via Short-term Resets},\n author = {Awasthi, Pranjal and Bhatia, Kush and Gollapudi, Sreenivas and Kollias, Kostas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1078--1100},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/awasthi22a/awasthi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/awasthi22a.html},\n abstract = \t {For traffic routing platforms, the choice of which route to recommend to a user depends on the congestion on these routes \u2013 indeed, an individual\u2019s utility depends on the number of people using the recommended route at that instance. Motivated by this, we introduce the problem of Congested Bandits where each arm\u2019s reward is allowed to depend on the number of times it was played in the past $\\Delta$ timesteps. This dependence on past history of actions leads to a dynamical system where an algorithm\u2019s present choices also affect its future pay-offs, and requires an algorithm to plan for this. We study the congestion aware formulation in the multi-armed bandit (MAB) setup and in the contextual bandit setup with linear rewards. For the multi-armed setup, we propose a UCB style algorithm and show that its policy regret scales as $\\tilde{O}(\\sqrt{K \\Delta T})$. For the linear contextual bandit setup, our algorithm, based on an iterative least squares planner, achieves policy regret $\\tilde{O}(\\sqrt{dT} + \\Delta)$. From an experimental standpoint, we corroborate the no-regret properties of our algorithms via a simulation study.}\n}", "pdf": "https://proceedings.mlr.press/v162/awasthi22a/awasthi22a.pdf", "supp": "", "pdf_size": 691236, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18422691580587375206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research; UC Berkeley; Google Research; Google Research", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/awasthi22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.berkeley.edu", "aff_unique_abbr": "Google Research;UC Berkeley", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Mountain View;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Connect, Not Collapse: Explaining Contrastive Learning for Unsupervised Domain Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18405", "id": "18405", "proceeding": "https://proceedings.mlr.press/v162/shen22d.html", "poster": "/media/PosterPDFs/ICML%202022/f79921bbae40a577928b76d2fc3edc2a.png?t=1657906784.014832", "slides": "/media/icml-2022/Slides/18405.pdf", "author_site": "Kendrick Shen, Robbie Jones, Ananya Kumar, Sang Michael Xie, Jeff Z. 
HaoChen, Tengyu Ma, Percy Liang", "author": "Kendrick Shen; Robbie M Jones; Ananya Kumar; Sang Michael Xie; Jeff Z. Haochen; Tengyu Ma; Percy Liang", "abstract": "We consider unsupervised domain adaptation (UDA), where labeled data from a source domain (e.g., photos) and unlabeled data from a target domain (e.g., sketches) are used to learn a classifier for the target domain. Conventional UDA methods (e.g., domain adversarial training) learn domain-invariant features to generalize from the source domain to the target domain. In this paper, we show that contrastive pre-training, which learns features on unlabeled source and target data and then fine-tunes on labeled source data, is competitive with strong UDA methods. However, we find that contrastive pre-training does not learn domain-invariant features, diverging from conventional UDA intuitions. We show theoretically that contrastive pre-training can learn features that vary subtantially across domains but still generalize to the target domain, by disentangling domain and class information. We empirically validate our theory on benchmark vision datasets.", "bibtex": "@InProceedings{pmlr-v162-shen22d,\n title = \t {Connect, Not Collapse: Explaining Contrastive Learning for Unsupervised Domain Adaptation},\n author = {Shen, Kendrick and Jones, Robbie M and Kumar, Ananya and Xie, Sang Michael and Haochen, Jeff Z. and Ma, Tengyu and Liang, Percy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19847--19878},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22d/shen22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22d.html},\n abstract = \t {We consider unsupervised domain adaptation (UDA), where labeled data from a source domain (e.g., photos) and unlabeled data from a target domain (e.g., sketches) are used to learn a classifier for the target domain. Conventional UDA methods (e.g., domain adversarial training) learn domain-invariant features to generalize from the source domain to the target domain. In this paper, we show that contrastive pre-training, which learns features on unlabeled source and target data and then fine-tunes on labeled source data, is competitive with strong UDA methods. However, we find that contrastive pre-training does not learn domain-invariant features, diverging from conventional UDA intuitions. We show theoretically that contrastive pre-training can learn features that vary subtantially across domains but still generalize to the target domain, by disentangling domain and class information. 
We empirically validate our theory on benchmark vision datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22d/shen22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/shen22d-supp.zip", "pdf_size": 2749154, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10698421062095610581&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu;cs.stanford.edu;cs.stanford.edu;cs.stanford.edu; ; ; ", "email": "cs.stanford.edu;cs.stanford.edu;cs.stanford.edu;cs.stanford.edu; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/shen22d.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Consensus Multiplicative Weights Update: Learning to Learn using Projector-based Game Signatures", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16035", "id": "16035", "proceeding": "https://proceedings.mlr.press/v162/vadori22a.html", "poster": "/media/PosterPDFs/ICML%202022/05a70454516ecd9194c293b0e415777f.png?t=1657579985.5544415", "slides": "", "author_site": "Nelson Vadori, Rahul Savani, Thomas Spooner, Sumitra Ganesh", "author": "Nelson Vadori; Rahul Savani; Thomas Spooner; Sumitra Ganesh", "abstract": "Cheung and Piliouras (2020) recently showed that two variants of the Multiplicative Weights Update method - OMWU and MWU - display opposite convergence properties depending on whether the game is zero-sum or cooperative. 
Inspired by this work and the recent literature on learning to optimize for single functions, we introduce a new framework for learning last-iterate convergence to Nash Equilibria in games, where the update rule\u2019s coefficients (learning rates) along a trajectory are learnt by a reinforcement learning policy that is conditioned on the nature of the game:", "bibtex": "@InProceedings{pmlr-v162-vadori22a,\n title = \t {Consensus Multiplicative Weights Update: Learning to Learn using Projector-based Game Signatures},\n author = {Vadori, Nelson and Savani, Rahul and Spooner, Thomas and Ganesh, Sumitra},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21901--21926},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vadori22a/vadori22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vadori22a.html},\n abstract = \t {Cheung and Piliouras (2020) recently showed that two variants of the Multiplicative Weights Update method - OMWU and MWU - display opposite convergence properties depending on whether the game is zero-sum or cooperative. Inspired by this work and the recent literature on learning to optimize for single functions, we introduce a new framework for learning last-iterate convergence to Nash Equilibria in games, where the update rule\u2019s coefficients (learning rates) along a trajectory are learnt by a reinforcement learning policy that is conditioned on the nature of the game:", "pdf": "https://proceedings.mlr.press/v162/vadori22a/vadori22a.pdf", "supp": "", "pdf_size": 1632401, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14829608370189967195&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "J.P. Morgan AI Research; Department of Computer Science, University of Liverpool; J.P. Morgan AI Research; J.P. Morgan AI Research", "aff_domain": "jpmorgan.com; ; ; ", "email": "jpmorgan.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/vadori22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "J.P. Morgan;University of Liverpool", "aff_unique_dep": "AI Research;Department of Computer Science", "aff_unique_url": "https://www.jpmorgan.com;https://www.liverpool.ac.uk", "aff_unique_abbr": "JPM;Liv Uni", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Consistent Polyhedral Surrogates for Top-k Classification and Variants", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17293", "id": "17293", "proceeding": "https://proceedings.mlr.press/v162/thilagar22a.html", "poster": "/media/PosterPDFs/ICML%202022/c236337b043acf93c7df397fdb9082b3.png?t=1658040787.258773", "slides": "", "author_site": "Anish Thilagar, Rafael Frongillo, Jessie Finocchiaro, Emma Goodwill", "author": "Anish Thilagar; Rafael Frongillo; Jessica J Finocchiaro; Emma Goodwill", "abstract": "Top-$k$ classification is a generalization of multiclass classification used widely in information retrieval, image classification, and other extreme classification settings. 
Several hinge-like (piecewise-linear) surrogates have been proposed for the problem, yet all are either non-convex or inconsistent. For the proposed hinge-like surrogates that are convex (i.e., polyhedral), we apply the recent embedding framework of Finocchiaro et al. (2019; 2022) to determine the prediction problem for which the surrogate is consistent. These problems can all be interpreted as variants of top-$k$ classification, which may be better aligned with some applications. We leverage this analysis to derive constraints on the conditional label distributions under which these proposed surrogates become consistent for top-$k$. It has been further suggested that every convex hinge-like surrogate must be inconsistent for top-$k$. Yet, we use the same embedding framework to give the first consistent polyhedral surrogate for this problem.", "bibtex": "@InProceedings{pmlr-v162-thilagar22a,\n title = \t {Consistent Polyhedral Surrogates for Top-k Classification and Variants},\n author = {Thilagar, Anish and Frongillo, Rafael and Finocchiaro, Jessica J and Goodwill, Emma},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21329--21359},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/thilagar22a/thilagar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/thilagar22a.html},\n abstract = \t {Top-$k$ classification is a generalization of multiclass classification used widely in information retrieval, image classification, and other extreme classification settings. Several hinge-like (piecewise-linear) surrogates have been proposed for the problem, yet all are either non-convex or inconsistent. For the proposed hinge-like surrogates that are convex (i.e., polyhedral), we apply the recent embedding framework of Finocchiaro et al. (2019; 2022) to determine the prediction problem for which the surrogate is consistent. These problems can all be interpreted as variants of top-$k$ classification, which may be better aligned with some applications. We leverage this analysis to derive constraints on the conditional label distributions under which these proposed surrogates become consistent for top-$k$. It has been further suggested that every convex hinge-like surrogate must be inconsistent for top-$k$. 
Yet, we use the same embedding framework to give the first consistent polyhedral surrogate for this problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/thilagar22a/thilagar22a.pdf", "supp": "", "pdf_size": 659021, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=657859179896724623&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "University of Colorado Boulder Department of Computer Science, Boulder, CO, USA; University of Colorado Boulder Department of Computer Science, Boulder, CO, USA; University of Colorado Boulder Department of Computer Science, Boulder, CO, USA; University of Colorado Boulder Department of Computer Science, Boulder, CO, USA", "aff_domain": "colorado.edu; ; ;colorado.edu", "email": "colorado.edu; ; ;colorado.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/thilagar22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Colorado Boulder", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.colorado.edu", "aff_unique_abbr": "CU Boulder", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Boulder", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Constants Matter: The Performance Gains of Active Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17609", "id": "17609", "proceeding": "https://proceedings.mlr.press/v162/mussmann22a.html", "poster": "/media/PosterPDFs/ICML%202022/ccf8111910291ba472b385e9c5f59099.png?t=1657682365.048881", "slides": "", "author_site": "Stephen Mussmann, Sanjoy Dasgupta", "author": "Stephen O Mussmann; Sanjoy Dasgupta", "abstract": "Within machine learning, active learning studies the gains in performance made possible by adaptively selecting data points to label. In this work, we show through upper and lower bounds, that for a simple benign setting of well-specified logistic regression on a uniform distribution over a sphere, the expected excess error of both active learning and random sampling have the same inverse proportional dependence on the number of samples. Importantly, due to the nature of lower bounds, any more general setting does not allow a better dependence on the number of samples. Additionally, we show a variant of uncertainty sampling can achieve a faster rate of convergence than random sampling by a factor of the Bayes error, a recent empirical observation made by other work. 
Qualitatively, this work is pessimistic with respect to the asymptotic dependence on the number of samples, but optimistic with respect to finding performance gains in the constants.", "bibtex": "@InProceedings{pmlr-v162-mussmann22a,\n title = \t {Constants Matter: The Performance Gains of Active Learning},\n author = {Mussmann, Stephen O and Dasgupta, Sanjoy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16123--16173},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mussmann22a/mussmann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mussmann22a.html},\n abstract = \t {Within machine learning, active learning studies the gains in performance made possible by adaptively selecting data points to label. In this work, we show through upper and lower bounds, that for a simple benign setting of well-specified logistic regression on a uniform distribution over a sphere, the expected excess error of both active learning and random sampling have the same inverse proportional dependence on the number of samples. Importantly, due to the nature of lower bounds, any more general setting does not allow a better dependence on the number of samples. Additionally, we show a variant of uncertainty sampling can achieve a faster rate of convergence than random sampling by a factor of the Bayes error, a recent empirical observation made by other work. Qualitatively, this work is pessimistic with respect to the asymptotic dependence on the number of samples, but optimistic with respect to finding performance gains in the constants.}\n}", "pdf": "https://proceedings.mlr.press/v162/mussmann22a/mussmann22a.pdf", "supp": "", "pdf_size": 636000, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5283743905652759512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Paul G. Allen School of Computer Science & Engineering, University of Washington, Seattle, USA; Computer Science & Engineering, University of California, San Diego, USA", "aff_domain": "cs.washington.edu; ", "email": "cs.washington.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/mussmann22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Washington;University of California, San Diego", "aff_unique_dep": "Paul G. 
Allen School of Computer Science & Engineering;Computer Science & Engineering", "aff_unique_url": "https://www.washington.edu;https://www.ucsd.edu", "aff_unique_abbr": "UW;UCSD", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Seattle;San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Constrained Discrete Black-Box Optimization using Mixed-Integer Programming", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16559", "id": "16559", "proceeding": "https://proceedings.mlr.press/v162/papalexopoulos22a.html", "poster": "/media/PosterPDFs/ICML%202022/5cc4bb753030a3d804351b2dfec0d8b5.png?t=1657896240.676478", "slides": "", "author_site": "Theodore Papalexopoulos, Christian Tjandraatmadja, Ross Anderson, Juan Pablo Vielma, David Belanger", "author": "Theodore P Papalexopoulos; Christian Tjandraatmadja; Ross Anderson; Juan Pablo Vielma; David Belanger", "abstract": "Discrete black-box optimization problems are challenging for model-based optimization (MBO) algorithms, such as Bayesian optimization, due to the size of the search space and the need to satisfy combinatorial constraints. In particular, these methods require repeatedly solving a complex discrete global optimization problem in the inner loop, where popular heuristic inner-loop solvers introduce approximations and are difficult to adapt to combinatorial constraints. In response, we propose NN+MILP, a general discrete MBO framework using piecewise-linear neural networks as surrogate models and mixed-integer linear programming (MILP) to optimize the acquisition function. MILP provides optimality guarantees and a versatile declarative language for domain-specific constraints. We test our approach on a range of unconstrained and constrained problems, including DNA binding, constrained binary quadratic problems from the MINLPLib benchmark, and the NAS-Bench-101 neural architecture search benchmark. NN+MILP surpasses or matches the performance of black-box algorithms tailored to the constraints at hand, with global optimization of the acquisition problem running in a few minutes using only standard software packages and hardware.", "bibtex": "@InProceedings{pmlr-v162-papalexopoulos22a,\n title = \t {Constrained Discrete Black-Box Optimization using Mixed-Integer Programming},\n author = {Papalexopoulos, Theodore P and Tjandraatmadja, Christian and Anderson, Ross and Vielma, Juan Pablo and Belanger, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17295--17322},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/papalexopoulos22a/papalexopoulos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/papalexopoulos22a.html},\n abstract = \t {Discrete black-box optimization problems are challenging for model-based optimization (MBO) algorithms, such as Bayesian optimization, due to the size of the search space and the need to satisfy combinatorial constraints. In particular, these methods require repeatedly solving a complex discrete global optimization problem in the inner loop, where popular heuristic inner-loop solvers introduce approximations and are difficult to adapt to combinatorial constraints. 
In response, we propose NN+MILP, a general discrete MBO framework using piecewise-linear neural networks as surrogate models and mixed-integer linear programming (MILP) to optimize the acquisition function. MILP provides optimality guarantees and a versatile declarative language for domain-specific constraints. We test our approach on a range of unconstrained and constrained problems, including DNA binding, constrained binary quadratic problems from the MINLPLib benchmark, and the NAS-Bench-101 neural architecture search benchmark. NN+MILP surpasses or matches the performance of black-box algorithms tailored to the constraints at hand, with global optimization of the acquisition problem running in a few minutes using only standard software packages and hardware.}\n}", "pdf": "https://proceedings.mlr.press/v162/papalexopoulos22a/papalexopoulos22a.pdf", "supp": "", "pdf_size": 5270714, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4611976570746888116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Operations Research Center, Massachusetts Institute of Technology, Cambridge MA, USA+Google Research, Cambridge MA, USA; Google Research, Cambridge MA, USA; Google Research, Cambridge MA, USA; Google Research, Cambridge MA, USA; Google Research, Cambridge MA, USA", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/papalexopoulos22a.html", "aff_unique_index": "0+1;1;1;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": "Operations Research Center;Google Research", "aff_unique_url": "https://web.mit.edu;https://research.google", "aff_unique_abbr": "MIT;Google", "aff_campus_unique_index": "0+0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Constrained Gradient Descent: A Powerful and Principled Evasion Attack Against Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16715", "id": "16715", "proceeding": "https://proceedings.mlr.press/v162/lin22e.html", "poster": "/media/PosterPDFs/ICML%202022/4daa3db355ef2b0e64b472968cb70f0d.png?t=1657584129.1820464", "slides": "", "author_site": "Weiran Lin, Keane Lucas, Lujo Bauer, Michael Reiter, Mahmood Sharif", "author": "Weiran Lin; Keane Lucas; Lujo Bauer; Michael K. Reiter; Mahmood Sharif", "abstract": "We propose new, more efficient targeted white-box attacks against deep neural networks. Our attacks better align with the attacker\u2019s goal: (1) tricking a model to assign higher probability to the target class than to any other class, while (2) staying within an $\\epsilon$-distance of the attacked input. First, we demonstrate a loss function that explicitly encodes (1) and show that Auto-PGD finds more attacks with it. Second, we propose a new attack method, Constrained Gradient Descent (CGD), using a refinement of our loss function that captures both (1) and (2). CGD seeks to satisfy both attacker objectives\u2014misclassification and bounded $\\ell_{p}$-norm\u2014in a principled manner, as part of the optimization, instead of via ad hoc post-processing techniques (e.g., projection or clipping). We show that CGD is more successful on CIFAR10 (0.9\u20134.2%) and ImageNet (8.6\u201313.6%) than state-of-the-art attacks while consuming less time (11.4\u201318.8%). 
Statistical tests confirm that our attack outperforms others against leading defenses on different datasets and values of $\\epsilon$.", "bibtex": "@InProceedings{pmlr-v162-lin22e,\n title = \t {Constrained Gradient Descent: A Powerful and Principled Evasion Attack Against Neural Networks},\n author = {Lin, Weiran and Lucas, Keane and Bauer, Lujo and Reiter, Michael K. and Sharif, Mahmood},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13405--13430},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22e/lin22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22e.html},\n abstract = \t {We propose new, more efficient targeted white-box attacks against deep neural networks. Our attacks better align with the attacker\u2019s goal: (1) tricking a model to assign higher probability to the target class than to any other class, while (2) staying within an $\\epsilon$-distance of the attacked input. First, we demonstrate a loss function that explicitly encodes (1) and show that Auto-PGD finds more attacks with it. Second, we propose a new attack method, Constrained Gradient Descent (CGD), using a refinement of our loss function that captures both (1) and (2). CGD seeks to satisfy both attacker objectives\u2014misclassification and bounded $\\ell_{p}$-norm\u2014in a principled manner, as part of the optimization, instead of via ad hoc post-processing techniques (e.g., projection or clipping). We show that CGD is more successful on CIFAR10 (0.9\u20134.2%) and ImageNet (8.6\u201313.6%) than state-of-the-art attacks while consuming less time (11.4\u201318.8%). 
Statistical tests confirm that our attack outperforms others against leading defenses on different datasets and values of $\\epsilon$.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22e/lin22e.pdf", "supp": "", "pdf_size": 12912660, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14082433359159261518&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical & Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, US + Cylab, Carnegie Mellon University, Pittsburgh, PA, US + Institute for Software Research, Carnegie Mellon University, Pittsburgh, PA, US; Department of Electrical & Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, US + Cylab, Carnegie Mellon University, Pittsburgh, PA, US + Institute for Software Research, Carnegie Mellon University, Pittsburgh, PA, US; Department of Electrical & Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, US + Cylab, Carnegie Mellon University, Pittsburgh, PA, US + Institute for Software Research, Carnegie Mellon University, Pittsburgh, PA, US; Departments of Computer Science and Electrical & Computer Engineering, Duke University, Durham, NC, US; School of Computer Science, Tel Aviv University, Tel Aviv, Israel", "aff_domain": "andrew.cmu.edu;andrew.cmu.edu;cmu.edu;duke.edu;cs.tau.ac.il", "email": "andrew.cmu.edu;andrew.cmu.edu;cmu.edu;duke.edu;cs.tau.ac.il", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lin22e.html", "aff_unique_index": "0+0+0;0+0+0;0+0+0;1;2", "aff_unique_norm": "Carnegie Mellon University;Duke University;Tel Aviv University", "aff_unique_dep": "Department of Electrical & Computer Engineering;Departments of Computer Science and Electrical & Computer Engineering;School of Computer Science", "aff_unique_url": "https://www.cmu.edu;https://www.duke.edu;https://www.tau.ac.il", "aff_unique_abbr": "CMU;Duke;TAU", "aff_campus_unique_index": "0+0+0;0+0+0;0+0+0;1;2", "aff_campus_unique": "Pittsburgh;Durham;Tel Aviv", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0;0;1", "aff_country_unique": "United States;Israel" }, { "title": "Constrained Offline Policy Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16569", "id": "16569", "proceeding": "https://proceedings.mlr.press/v162/polosky22a.html", "poster": "/media/PosterPDFs/ICML%202022/b33128cb0089003ddfb5199e1b679652.png?t=1658270212.5422657", "slides": "", "author_site": "Nicholas Polosky, Bruno C. da Silva, Madalina Fiterau, Jithin Jagannath", "author": "Nicholas Polosky; Bruno C. Da Silva; Madalina Fiterau; Jithin Jagannath", "abstract": "In this work we introduce Constrained Offline Policy Optimization (COPO), an offline policy optimization algorithm for learning in MDPs with cost constraints. COPO is built upon a novel offline cost-projection method, which we formally derive and analyze. Our method improves upon the state-of-the-art in offline constrained policy optimization by explicitly accounting for distributional shift and by offering non-asymptotic confidence bounds on the cost of a policy. These formal properties are superior to those of existing techniques, which only guarantee convergence to a point estimate. 
We formally analyze our method and empirically demonstrate that it achieves state-of-the-art performance on discrete and continuous control problems, while offering the aforementioned improved, stronger, and more robust theoretical guarantees.", "bibtex": "@InProceedings{pmlr-v162-polosky22a,\n title = \t {Constrained Offline Policy Optimization},\n author = {Polosky, Nicholas and Silva, Bruno C. Da and Fiterau, Madalina and Jagannath, Jithin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17801--17810},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/polosky22a/polosky22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/polosky22a.html},\n abstract = \t {In this work we introduce Constrained Offline Policy Optimization (COPO), an offline policy optimization algorithm for learning in MDPs with cost constraints. COPO is built upon a novel offline cost-projection method, which we formally derive and analyze. Our method improves upon the state-of-the-art in offline constrained policy optimization by explicitly accounting for distributional shift and by offering non-asymptotic confidence bounds on the cost of a policy. These formal properties are superior to those of existing techniques, which only guarantee convergence to a point estimate. We formally analyze our method and empirically demonstrate that it achieves state-of-the-art performance on discrete and continuous control problems, while offering the aforementioned improved, stronger, and more robust theoretical guarantees.}\n}", "pdf": "https://proceedings.mlr.press/v162/polosky22a/polosky22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/polosky22a-supp.zip", "pdf_size": 392937, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3469498996111299555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "ANDRO Computational Solutions, Rome, NY 13440; University of Massachusetts at Amherst, MA 01003; University of Massachusetts at Amherst, MA 01003; ANDRO Computational Solutions, Rome, NY 13440", "aff_domain": "androcs.com; ; ; ", "email": "androcs.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/polosky22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "ANDRO Computational Solutions;University of Massachusetts Amherst", "aff_unique_dep": ";", "aff_unique_url": ";https://www.umass.edu", "aff_unique_abbr": ";UMass Amherst", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Constrained Optimization with Dynamic Bound-scaling for Effective NLP Backdoor Defense", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17165", "id": "17165", "proceeding": "https://proceedings.mlr.press/v162/shen22e.html", "poster": "/media/PosterPDFs/ICML%202022/47698c15fb83a1e5bb1400accbb17f82.png?t=1657732540.5650532", "slides": "", "author_site": "Guangyu Shen, Yingqi Liu, Guanhong Tao, Qiuling Xu, ZHUO ZHANG, Shengwei An, Shiqing Ma, Xiangyu Zhang", "author": "Guangyu Shen; Yingqi Liu; Guanhong Tao; Qiuling Xu; Zhuo Zhang; Shengwei An; Shiqing Ma; Xiangyu Zhang", 
"abstract": "Modern language models are vulnerable to backdoor attacks. An injected malicious token sequence (i.e., a trigger) can cause the compromised model to misbehave, raising security concerns. Trigger inversion is a widely-used technique for scanning backdoors in vision models. It can- not be directly applied to NLP models due to their discrete nature. In this paper, we develop a novel optimization method for NLP backdoor inversion. We leverage a dynamically reducing temperature coefficient in the softmax function to provide changing loss landscapes to the optimizer such that the process gradually focuses on the ground truth trigger, which is denoted as a one-hot value in a convex hull. Our method also features a temperature rollback mechanism to step away from local optimals, exploiting the observation that local optimals can be easily determined in NLP trigger inversion (while not in general optimization). We evaluate the technique on over 1600 models (with roughly half of them having injected backdoors) on 3 prevailing NLP tasks, with 4 different backdoor attacks and 7 architectures. Our results show that the technique is able to effectively and efficiently detect and remove backdoors, outperforming 5 baseline methods. The code is available at https: //github.com/PurduePAML/DBS.", "bibtex": "@InProceedings{pmlr-v162-shen22e,\n title = \t {Constrained Optimization with Dynamic Bound-scaling for Effective {NLP} Backdoor Defense},\n author = {Shen, Guangyu and Liu, Yingqi and Tao, Guanhong and Xu, Qiuling and Zhang, Zhuo and An, Shengwei and Ma, Shiqing and Zhang, Xiangyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19879--19892},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22e/shen22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22e.html},\n abstract = \t {Modern language models are vulnerable to backdoor attacks. An injected malicious token sequence (i.e., a trigger) can cause the compromised model to misbehave, raising security concerns. Trigger inversion is a widely-used technique for scanning backdoors in vision models. It can- not be directly applied to NLP models due to their discrete nature. In this paper, we develop a novel optimization method for NLP backdoor inversion. We leverage a dynamically reducing temperature coefficient in the softmax function to provide changing loss landscapes to the optimizer such that the process gradually focuses on the ground truth trigger, which is denoted as a one-hot value in a convex hull. Our method also features a temperature rollback mechanism to step away from local optimals, exploiting the observation that local optimals can be easily determined in NLP trigger inversion (while not in general optimization). We evaluate the technique on over 1600 models (with roughly half of them having injected backdoors) on 3 prevailing NLP tasks, with 4 different backdoor attacks and 7 architectures. Our results show that the technique is able to effectively and efficiently detect and remove backdoors, outperforming 5 baseline methods. 
The code is available at https://github.com/PurduePAML/DBS.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22e/shen22e.pdf", "supp": "", "pdf_size": 1249689, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3667762180360918452&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "https://github.com/PurduePAML/DBS", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/shen22e.html" }, { "title": "Constrained Variational Policy Optimization for Safe Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16641", "id": "16641", "proceeding": "https://proceedings.mlr.press/v162/liu22b.html", "poster": "/media/PosterPDFs/ICML%202022/f4a331b7a22d1b237565d8813a34d8ac.png?t=1657686058.4601955", "slides": "", "author_site": "Zuxin Liu, Zhepeng Cen, Vladislav Isenbaev, Wei Liu, Steven Wu, Bo Li, Ding Zhao", "author": "Zuxin Liu; Zhepeng Cen; Vladislav Isenbaev; Wei Liu; Steven Wu; Bo Li; Ding Zhao", "abstract": "Safe reinforcement learning (RL) aims to learn policies that satisfy certain constraints before deploying them to safety-critical applications. Previous primal-dual style approaches suffer from instability issues and lack optimality guarantees. This paper overcomes the issues from the perspective of probabilistic inference.
We introduce a novel Expectation-Maximization approach to naturally incorporate constraints during the policy learning: 1) a provable optimal non-parametric variational distribution could be computed in closed form after a convex optimization (E-step); 2) the policy parameter is improved within the trust region based on the optimal variational distribution (M-step). The proposed algorithm decomposes the safe RL problem into a convex optimization phase and a supervised learning phase, which yields a more stable training performance. A wide range of experiments on continuous robotic tasks shows that the proposed method achieves significantly better constraint satisfaction performance and better sample efficiency than baselines. The code is available at https://github.com/liuzuxin/cvpo-safe-rl.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22b/liu22b.pdf", "supp": "", "pdf_size": 2167622, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13833315390800713597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Carnegie Mellon University; Carnegie Mellon University; Nuro Inc.; Nuro Inc.; Carnegie Mellon University; University of Illinois Urbana-Champaign; Carnegie Mellon University", "aff_domain": "cmu.edu; ; ; ; ; ;cmu.edu", "email": "cmu.edu; ; ; ; ; ;cmu.edu", "github": "https://github.com/liuzuxin/cvpo-safe-rl", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/liu22b.html", "aff_unique_index": "0;0;1;1;0;2;0", "aff_unique_norm": "Carnegie Mellon University;Nuro Inc.;University of Illinois Urbana-Champaign", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.nuro.ai;https://illinois.edu", "aff_unique_abbr": "CMU;Nuro;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Constraint-based graph network simulator", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17851", "id": "17851", "proceeding": "https://proceedings.mlr.press/v162/rubanova22a.html", "poster": "", "slides": "", "author_site": "Yulia Rubanova, Alvaro Sanchez-Gonzalez, Tobias Pfaff, Peter Battaglia", "author": "Yulia Rubanova; Alvaro Sanchez-Gonzalez; Tobias Pfaff; Peter Battaglia", "abstract": "In the area of physical simulations, nearly all neural-network-based methods directly predict future states from the input states. However, many traditional simulation engines instead model the constraints of the system and select the state which satisfies them. Here we present a framework for constraint-based learned simulation, where a scalar constraint function is implemented as a graph neural network, and future predictions are computed by solving the optimization problem defined by the learned constraint. Our model achieves comparable or better accuracy to top learned simulators on a variety of challenging physical domains, and offers several unique advantages. We can improve the simulation accuracy on a larger system by applying more solver iterations at test time. We also can incorporate novel hand-designed constraints at test time and simulate new dynamics which were not present in the training data. 
Our constraint-based framework shows how key techniques from traditional simulation and numerical methods can be leveraged as inductive biases in machine learning simulators.", "bibtex": "@InProceedings{pmlr-v162-rubanova22a,\n title = \t {Constraint-based graph network simulator},\n author = {Rubanova, Yulia and Sanchez-Gonzalez, Alvaro and Pfaff, Tobias and Battaglia, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18844--18870},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rubanova22a/rubanova22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rubanova22a.html},\n abstract = \t {In the area of physical simulations, nearly all neural-network-based methods directly predict future states from the input states. However, many traditional simulation engines instead model the constraints of the system and select the state which satisfies them. Here we present a framework for constraint-based learned simulation, where a scalar constraint function is implemented as a graph neural network, and future predictions are computed by solving the optimization problem defined by the learned constraint. Our model achieves comparable or better accuracy to top learned simulators on a variety of challenging physical domains, and offers several unique advantages. We can improve the simulation accuracy on a larger system by applying more solver iterations at test time. We also can incorporate novel hand-designed constraints at test time and simulate new dynamics which were not present in the training data. Our constraint-based framework shows how key techniques from traditional simulation and numerical methods can be leveraged as inductive biases in machine learning simulators.}\n}", "pdf": "https://proceedings.mlr.press/v162/rubanova22a/rubanova22a.pdf", "supp": "", "pdf_size": 15775894, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1543304278279933928&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "deepmind.com;deepmind.com; ; ", "email": "deepmind.com;deepmind.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/rubanova22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Content Addressable Memory Without Catastrophic Forgetting by Heteroassociation with a Fixed Scaffold", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18271", "id": "18271", "proceeding": "https://proceedings.mlr.press/v162/sharma22b.html", "poster": "/media/PosterPDFs/ICML%202022/d6539d3b57159babf6a72e106beb45bd_LHsA93q.png?t=1657894115.0814779", "slides": "/media/icml-2022/Slides/18271.pdf", "author_site": "Sugandha Sharma, Sarthak Chandra, Ila R. 
Fiete", "author": "Sugandha Sharma; Sarthak Chandra; Ila Fiete", "abstract": "Content-addressable memory (CAM) networks, so-called because stored items can be recalled by partial or corrupted versions of the items, exhibit near-perfect recall of a small number of information-dense patterns below capacity and a \u2019memory cliff\u2019 beyond, such that inserting a single additional pattern results in catastrophic loss of all stored patterns. We propose a novel CAM architecture, Memory Scaffold with Heteroassociation (MESH), that factorizes the problems of internal attractor dynamics and association with external content to generate a CAM continuum without a memory cliff: Small numbers of patterns are stored with complete information recovery matching standard CAMs, while inserting more patterns still results in partial recall of every pattern, with a graceful trade-off between pattern number and pattern richness. Motivated by the architecture of the Entorhinal-Hippocampal memory circuit in the brain, MESH is a tripartite architecture with pairwise interactions that uses a predetermined set of internally stabilized states together with heteroassociation between the internal states and arbitrary external patterns. We show analytically and experimentally that for any number of stored patterns, MESH nearly saturates the total information bound (given by the number of synapses) for CAM networks, outperforming all existing CAM models.", "bibtex": "@InProceedings{pmlr-v162-sharma22b,\n title = \t {Content Addressable Memory Without Catastrophic Forgetting by Heteroassociation with a Fixed Scaffold},\n author = {Sharma, Sugandha and Chandra, Sarthak and Fiete, Ila},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19658--19682},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sharma22b/sharma22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/sharma22b.html},\n abstract = \t {Content-addressable memory (CAM) networks, so-called because stored items can be recalled by partial or corrupted versions of the items, exhibit near-perfect recall of a small number of information-dense patterns below capacity and a \u2019memory cliff\u2019 beyond, such that inserting a single additional pattern results in catastrophic loss of all stored patterns. We propose a novel CAM architecture, Memory Scaffold with Heteroassociation (MESH), that factorizes the problems of internal attractor dynamics and association with external content to generate a CAM continuum without a memory cliff: Small numbers of patterns are stored with complete information recovery matching standard CAMs, while inserting more patterns still results in partial recall of every pattern, with a graceful trade-off between pattern number and pattern richness. Motivated by the architecture of the Entorhinal-Hippocampal memory circuit in the brain, MESH is a tripartite architecture with pairwise interactions that uses a predetermined set of internally stabilized states together with heteroassociation between the internal states and arbitrary external patterns. 
We show analytically and experimentally that for any number of stored patterns, MESH nearly saturates the total information bound (given by the number of synapses) for CAM networks, outperforming all existing CAM models.}\n}", "pdf": "https://proceedings.mlr.press/v162/sharma22b/sharma22b.pdf", "supp": "", "pdf_size": 4275605, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16874084475877050820&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff": "Department of Brain and Cognitive Sciences, McGovern Institute for Brain Research, & Integrative Computational Neuroscience Center (ICoN), Massachusetts Institute of Technology, Cambridge, USA; Department of Brain and Cognitive Sciences, McGovern Institute for Brain Research, & Integrative Computational Neuroscience Center (ICoN), Massachusetts Institute of Technology, Cambridge, USA; Department of Brain and Cognitive Sciences, McGovern Institute for Brain Research, & Integrative Computational Neuroscience Center (ICoN), Massachusetts Institute of Technology, Cambridge, USA", "aff_domain": "mit.edu; ; ", "email": "mit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sharma22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Department of Brain and Cognitive Sciences", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ContentVec: An Improved Self-Supervised Speech Representation by Disentangling Speakers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18359", "id": "18359", "proceeding": "https://proceedings.mlr.press/v162/qian22b.html", "poster": "/media/PosterPDFs/ICML%202022/4ff3e350028d0cfcb92c3a87a57585b1.png?t=1658079137.6128454", "slides": "", "author_site": "Kaizhi Qian, Yang Zhang, Heting Gao, Junrui Ni, Cheng-I Lai, David Cox, Mark Hasegawa-Johnson, Shiyu Chang", "author": "Kaizhi Qian; Yang Zhang; Heting Gao; Junrui Ni; Cheng-I Lai; David Cox; Mark Hasegawa-Johnson; Shiyu Chang", "abstract": "Self-supervised learning in speech involves training a speech representation network on a large-scale unannotated speech corpus, and then applying the learned representations to downstream tasks. Since the majority of the downstream tasks of SSL learning in speech largely focus on the content information in speech, the most desirable speech representations should be able to disentangle unwanted variations, such as speaker variations, from the content. However, disentangling speakers is very challenging, because removing the speaker information could easily result in a loss of content as well, and the damage of the latter usually far outweighs the benefit of the former. In this paper, we propose a new SSL method that can achieve speaker disentanglement without severe loss of content. Our approach is adapted from the HuBERT framework, and incorporates disentangling mechanisms to regularize both the teacher labels and the learned representations. 
We evaluate the benefit of speaker disentanglement on a set of content-related downstream tasks, and observe a consistent and notable performance advantage of our speaker-disentangled representations.", "bibtex": "@InProceedings{pmlr-v162-qian22b,\n title = \t {{C}ontent{V}ec: An Improved Self-Supervised Speech Representation by Disentangling Speakers},\n author = {Qian, Kaizhi and Zhang, Yang and Gao, Heting and Ni, Junrui and Lai, Cheng-I and Cox, David and Hasegawa-Johnson, Mark and Chang, Shiyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18003--18017},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qian22b/qian22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/qian22b.html},\n abstract = \t {Self-supervised learning in speech involves training a speech representation network on a large-scale unannotated speech corpus, and then applying the learned representations to downstream tasks. Since the majority of the downstream tasks of SSL learning in speech largely focus on the content information in speech, the most desirable speech representations should be able to disentangle unwanted variations, such as speaker variations, from the content. However, disentangling speakers is very challenging, because removing the speaker information could easily result in a loss of content as well, and the damage of the latter usually far outweighs the benefit of the former. In this paper, we propose a new SSL method that can achieve speaker disentanglement without severe loss of content. Our approach is adapted from the HuBERT framework, and incorporates disentangling mechanisms to regularize both the teacher labels and the learned representations. 
We evaluate the benefit of speaker disentanglement on a set of content-related downstream tasks, and observe a consistent and notable performance advantage of our speaker-disentangled representations.}\n}", "pdf": "https://proceedings.mlr.press/v162/qian22b/qian22b.pdf", "supp": "", "pdf_size": 722289, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16442143470536354603&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "MIT-IBM Watson AI Lab; MIT-IBM Watson AI Lab; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; Massachusetts Institute of Technology; MIT-IBM Watson AI Lab; University of Illinois at Urbana-Champaign; University of California, Santa Barbara", "aff_domain": "ibm.com;ibm.com; ; ; ; ; ; ", "email": "ibm.com;ibm.com; ; ; ; ; ; ", "github": "https://github.com/auspicious3000/contentvec", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/qian22b.html", "aff_unique_index": "0;0;1;1;0;0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;University of Illinois Urbana-Champaign;University of California, Santa Barbara", "aff_unique_dep": "IBM Watson AI Lab;;", "aff_unique_url": "https://www.mitibmwatsonailab.org;https://illinois.edu;https://www.ucsb.edu", "aff_unique_abbr": "MIT-IBM AI Lab;UIUC;UCSB", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Urbana-Champaign;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Context-Aware Drift Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18017", "id": "18017", "proceeding": "https://proceedings.mlr.press/v162/cobb22a.html", "poster": "/media/PosterPDFs/ICML%202022/2cfa47a65809ea0496bbf9aa363dc5da.png?t=1657525090.7260122", "slides": "", "author_site": "Oliver Cobb, Arnaud Van Looveren", "author": "Oliver Cobb; Arnaud Van Looveren", "abstract": "When monitoring machine learning systems, two-sample tests of homogeneity form the foundation upon which existing approaches to drift detection build. They are used to test for evidence that the distribution underlying recent deployment data differs from that underlying the historical reference data. Often, however, various factors such as time-induced correlation mean that batches of recent deployment data are not expected to form an i.i.d. sample from the historical data distribution. Instead we may wish to test for differences in the distributions conditional on", "bibtex": "@InProceedings{pmlr-v162-cobb22a,\n title = \t {Context-Aware Drift Detection},\n author = {Cobb, Oliver and Van Looveren, Arnaud},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4087--4111},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cobb22a/cobb22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cobb22a.html},\n abstract = \t {When monitoring machine learning systems, two-sample tests of homogeneity form the foundation upon which existing approaches to drift detection build. They are used to test for evidence that the distribution underlying recent deployment data differs from that underlying the historical reference data. 
Often, however, various factors such as time-induced correlation mean that batches of recent deployment data are not expected to form an i.i.d. sample from the historical data distribution. Instead we may wish to test for differences in the distributions conditional on", "pdf": "https://proceedings.mlr.press/v162/cobb22a/cobb22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/cobb22a-supp.zip", "pdf_size": 2045179, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9993193813631773645&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Seldon Technologies; Seldon Technologies", "aff_domain": "seldon.io; ", "email": "seldon.io; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/cobb22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Seldon Technologies", "aff_unique_dep": "", "aff_unique_url": "https://www.seldon.io", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Contextual Bandits with Large Action Spaces: Made Practical", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16993", "id": "16993", "proceeding": "https://proceedings.mlr.press/v162/zhu22b.html", "poster": "/media/PosterPDFs/ICML%202022/f58c9875ac84dfe1fbe91b918773d050.png?t=1657917763.7941995", "slides": "", "author_site": "Yinglun Zhu, Dylan Foster, John Langford, Paul Mineiro", "author": "Yinglun Zhu; Dylan J Foster; John Langford; Paul Mineiro", "abstract": "A central problem in sequential decision making is to develop algorithms that are practical and computationally efficient, yet support the use of flexible, general-purpose models. Focusing on the contextual bandit problem, recent progress provides provably efficient algorithms with strong empirical performance when the number of possible alternatives (\u201cactions\u201d) is small, but guarantees for decision making in large, continuous action spaces have remained elusive, leading to a significant gap between theory and practice. We present the first efficient, general-purpose algorithm for contextual bandits with continuous, linearly structured action spaces. Our algorithm makes use of computational oracles for (i) supervised learning, and (ii) optimization over the action space, and achieves sample complexity, runtime, and memory independent of the size of the action space. In addition, it is simple and practical. 
We perform a large-scale empirical evaluation, and show that our approach typically enjoys superior performance and efficiency compared to standard baselines.", "bibtex": "@InProceedings{pmlr-v162-zhu22b,\n title = \t {Contextual Bandits with Large Action Spaces: Made Practical},\n author = {Zhu, Yinglun and Foster, Dylan J and Langford, John and Mineiro, Paul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27428--27453},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22b/zhu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22b.html},\n abstract = \t {A central problem in sequential decision making is to develop algorithms that are practical and computationally efficient, yet support the use of flexible, general-purpose models. Focusing on the contextual bandit problem, recent progress provides provably efficient algorithms with strong empirical performance when the number of possible alternatives (\u201cactions\u201d) is small, but guarantees for decision making in large, continuous action spaces have remained elusive, leading to a significant gap between theory and practice. We present the first efficient, general-purpose algorithm for contextual bandits with continuous, linearly structured action spaces. Our algorithm makes use of computational oracles for (i) supervised learning, and (ii) optimization over the action space, and achieves sample complexity, runtime, and memory independent of the size of the action space. In addition, it is simple and practical. 
We perform a large-scale empirical evaluation, and show that our approach typically enjoys superior performance and efficiency compared to standard baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22b/zhu22b.pdf", "supp": "", "pdf_size": 501971, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5763648014002570810&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Wisconsin-Madison; Microsoft Research NYC; Microsoft Research NYC; Microsoft Research NYC", "aff_domain": "cs.wisc.edu; ; ; ", "email": "cs.wisc.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhu22b.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Wisconsin-Madison;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.wisc.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-york-city", "aff_unique_abbr": "UW-Madison;MSR NYC", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Madison;New York City", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Contextual Bandits with Smooth Regret: Efficient Learning in Continuous Action Spaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16947", "id": "16947", "proceeding": "https://proceedings.mlr.press/v162/zhu22h.html", "poster": "/media/PosterPDFs/ICML%202022/e7d161ac8d8a76529d39d9f5b4249ccb_wXVmO1q.png?t=1657917994.7233772", "slides": "", "author_site": "Yinglun Zhu, Paul Mineiro", "author": "Yinglun Zhu; Paul Mineiro", "abstract": "Designing efficient general-purpose contextual bandit algorithms that work with large\u2014or even infinite\u2014action spaces would facilitate application to important scenarios such as information retrieval, recommendation systems, and continuous control. While obtaining standard regret guarantees can be hopeless, alternative regret notions have been proposed to tackle the large action setting. We propose a smooth regret notion for contextual bandits, which dominates previously proposed alternatives. We design a statistically and computationally efficient algorithm\u2014for the proposed smooth regret\u2014that works with general function approximation under standard supervised oracles. We also present an adaptive algorithm that automatically adapts to any smoothness level. Our algorithms can be used to recover the previous minimax/Pareto optimal guarantees under the standard regret, e.g., in bandit problems with multiple best arms and Lipschitz/H{\u00f6}lder bandits. 
We conduct large-scale empirical evaluations demonstrating the efficacy of our proposed algorithms.", "bibtex": "@InProceedings{pmlr-v162-zhu22h,\n title = \t {Contextual Bandits with Smooth Regret: Efficient Learning in Continuous Action Spaces},\n author = {Zhu, Yinglun and Mineiro, Paul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27574--27590},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22h/zhu22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22h.html},\n abstract = \t {Designing efficient general-purpose contextual bandit algorithms that work with large\u2014or even infinite\u2014action spaces would facilitate application to important scenarios such as information retrieval, recommendation systems, and continuous control. While obtaining standard regret guarantees can be hopeless, alternative regret notions have been proposed to tackle the large action setting. We propose a smooth regret notion for contextual bandits, which dominates previously proposed alternatives. We design a statistically and computationally efficient algorithm\u2014for the proposed smooth regret\u2014that works with general function approximation under standard supervised oracles. We also present an adaptive algorithm that automatically adapts to any smoothness level. Our algorithms can be used to recover the previous minimax/Pareto optimal guarantees under the standard regret, e.g., in bandit problems with multiple best arms and Lipschitz/H{\u00f6}lder bandits. We conduct large-scale empirical evaluations demonstrating the efficacy of our proposed algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22h/zhu22h.pdf", "supp": "", "pdf_size": 456726, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2237234303144765537&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "University of Wisconsin-Madison; Microsoft Research NYC", "aff_domain": "cs.wisc.edu; ", "email": "cs.wisc.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhu22h.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Wisconsin-Madison;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.wisc.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-york-city", "aff_unique_abbr": "UW-Madison;MSR NYC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Madison;New York City", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Contextual Information-Directed Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16423", "id": "16423", "proceeding": "https://proceedings.mlr.press/v162/hao22b.html", "poster": "/media/PosterPDFs/ICML%202022/7fa732b517cbed14a48843d74526c11a.png?t=1657840329.7748337", "slides": "", "author_site": "Botao Hao, Tor Lattimore, Chao Qin", "author": "Botao Hao; Tor Lattimore; Chao Qin", "abstract": "Information-directed sampling (IDS) has recently demonstrated its potential as a data-efficient reinforcement learning algorithm. 
However, it is still unclear what is the right form of information ratio to optimize when contextual information is available. We investigate the IDS design through two contextual bandit problems: contextual bandits with graph feedback and sparse linear contextual bandits. We provably demonstrate the advantage of", "bibtex": "@InProceedings{pmlr-v162-hao22b,\n title = \t {Contextual Information-Directed Sampling},\n author = {Hao, Botao and Lattimore, Tor and Qin, Chao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8446--8464},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hao22b/hao22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/hao22b.html},\n abstract = \t {Information-directed sampling (IDS) has recently demonstrated its potential as a data-efficient reinforcement learning algorithm. However, it is still unclear what is the right form of information ratio to optimize when contextual information is available. We investigate the IDS design through two contextual bandit problems: contextual bandits with graph feedback and sparse linear contextual bandits. We provably demonstrate the advantage of", "pdf": "https://proceedings.mlr.press/v162/hao22b/hao22b.pdf", "supp": "", "pdf_size": 429877, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3123017157445721994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Deepmind; Deepmind; Columbia University", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hao22b.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "DeepMind;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://deepmind.com;https://www.columbia.edu", "aff_unique_abbr": "DeepMind;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Continual Learning via Sequential Function-Space Variational Inference", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18269", "id": "18269", "proceeding": "https://proceedings.mlr.press/v162/rudner22a.html", "poster": "", "slides": "", "author_site": "Tim G. J Rudner, Freddie Bickford Smith, QIXUAN FENG, Yee-Whye Teh, Yarin Gal", "author": "Tim G. J. Rudner; Freddie Bickford Smith; Qixuan Feng; Yee Whye Teh; Yarin Gal", "abstract": "Sequential Bayesian inference over predictive functions is a natural framework for continual learning from streams of data. However, applying it to neural networks has proved challenging in practice. Addressing the drawbacks of existing techniques, we propose an optimization objective derived by formulating continual learning as sequential function-space variational inference. In contrast to existing methods that regularize neural network parameters directly, this objective allows parameters to vary widely during training, enabling better adaptation to new tasks. Compared to objectives that directly regularize neural network predictions, the proposed objective allows for more flexible variational distributions and more effective regularization. 
We demonstrate that, across a range of task sequences, neural networks trained via sequential function-space variational inference achieve better predictive accuracy than networks trained with related methods while depending less on maintaining a set of representative points from previous tasks.", "bibtex": "@InProceedings{pmlr-v162-rudner22a,\n title = \t {Continual Learning via Sequential Function-Space Variational Inference},\n author = {Rudner, Tim G. J. and Bickford Smith, Freddie and Feng, Qixuan and Teh, Yee Whye and Gal, Yarin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18871--18887},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rudner22a/rudner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rudner22a.html},\n abstract = \t {Sequential Bayesian inference over predictive functions is a natural framework for continual learning from streams of data. However, applying it to neural networks has proved challenging in practice. Addressing the drawbacks of existing techniques, we propose an optimization objective derived by formulating continual learning as sequential function-space variational inference. In contrast to existing methods that regularize neural network parameters directly, this objective allows parameters to vary widely during training, enabling better adaptation to new tasks. Compared to objectives that directly regularize neural network predictions, the proposed objective allows for more flexible variational distributions and more effective regularization. We demonstrate that, across a range of task sequences, neural networks trained via sequential function-space variational inference achieve better predictive accuracy than networks trained with related methods while depending less on maintaining a set of representative points from previous tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/rudner22a/rudner22a.pdf", "supp": "", "pdf_size": 1153044, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4307301937228622596&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "University of Oxford; University of Oxford; University of Oxford; University of Oxford; University of Oxford", "aff_domain": "cs.ox.ac.uk; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/rudner22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Continual Learning with Guarantees via Weight Interval Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17639", "id": "17639", "proceeding": "https://proceedings.mlr.press/v162/wolczyk22a.html", "poster": "", "slides": "", "author_site": "Maciej Wo\u0142czyk, Karol J. 
Piczak, Bartosz W\u00f3jcik, \u0141ukasz Pustelnik, Pawe\u0142 Morawiecki, Jacek Tabor, Tomasz Trzcinski, Przemys\u0142aw Spurek", "author": "Maciej Wo\u0142czyk; Karol Piczak; Bartosz W\u00f3jcik; Lukasz Pustelnik; Pawe\u0142 Morawiecki; Jacek Tabor; Tomasz Trzcinski; Przemys\u0142aw Spurek", "abstract": "We introduce a new training paradigm that enforces interval constraints on neural network parameter space to control forgetting. Contemporary Continual Learning (CL) methods focus on training neural networks efficiently from a stream of data, while reducing the negative impact of catastrophic forgetting, yet they do not provide any firm guarantees that network performance will not deteriorate uncontrollably over time. In this work, we show how to put bounds on forgetting by reformulating continual learning of a model as a continual contraction of its parameter space. To that end, we propose Hyperrectangle Training, a new training methodology where each task is represented by a hyperrectangle in the parameter space, fully contained in the hyperrectangles of the previous tasks. This formulation reduces the NP-hard CL problem back to polynomial time while providing full resilience against forgetting. We validate our claim by developing InterContiNet (Interval Continual Learning) algorithm which leverages interval arithmetic to effectively model parameter regions as hyperrectangles. Through experimental results, we show that our approach performs well in a continual learning setup without storing data from previous tasks.", "bibtex": "@InProceedings{pmlr-v162-wolczyk22a,\n title = \t {Continual Learning with Guarantees via Weight Interval Constraints},\n author = {Wo{\\l}czyk, Maciej and Piczak, Karol and W{\\'o}jcik, Bartosz and Pustelnik, Lukasz and Morawiecki, Pawe{\\l} and Tabor, Jacek and Trzcinski, Tomasz and Spurek, Przemys{\\l}aw},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23897--23911},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wolczyk22a/wolczyk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wolczyk22a.html},\n abstract = \t {We introduce a new training paradigm that enforces interval constraints on neural network parameter space to control forgetting. Contemporary Continual Learning (CL) methods focus on training neural networks efficiently from a stream of data, while reducing the negative impact of catastrophic forgetting, yet they do not provide any firm guarantees that network performance will not deteriorate uncontrollably over time. In this work, we show how to put bounds on forgetting by reformulating continual learning of a model as a continual contraction of its parameter space. To that end, we propose Hyperrectangle Training, a new training methodology where each task is represented by a hyperrectangle in the parameter space, fully contained in the hyperrectangles of the previous tasks. This formulation reduces the NP-hard CL problem back to polynomial time while providing full resilience against forgetting. We validate our claim by developing InterContiNet (Interval Continual Learning) algorithm which leverages interval arithmetic to effectively model parameter regions as hyperrectangles. 
Through experimental results, we show that our approach performs well in a continual learning setup without storing data from previous tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/wolczyk22a/wolczyk22a.pdf", "supp": "", "pdf_size": 1400165, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12644818321484154250&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/wolczyk22a.html" }, { "title": "Continual Repeated Annealed Flow Transport Monte Carlo", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17565", "id": "17565", "proceeding": "https://proceedings.mlr.press/v162/matthews22a.html", "poster": "", "slides": "", "author_site": "Alexander Matthews, Michael Arbel, Danilo J. Rezende, Arnaud Doucet", "author": "Alex Matthews; Michael Arbel; Danilo Jimenez Rezende; Arnaud Doucet", "abstract": "We propose Continual Repeated Annealed Flow Transport Monte Carlo (CRAFT), a method that combines a sequential Monte Carlo (SMC) sampler (itself a generalization of Annealed Importance Sampling) with variational inference using normalizing flows. The normalizing flows are directly trained to transport between annealing temperatures using a KL divergence for each transition. This optimization objective is itself estimated using the normalizing flow/SMC approximation. We show conceptually and using multiple empirical examples that CRAFT improves on Annealed Flow Transport Monte Carlo (Arbel et al., 2021), on which it builds and also on Markov chain Monte Carlo (MCMC) based Stochastic Normalizing Flows (Wu et al., 2020). By incorporating CRAFT within particle MCMC, we show that such learnt samplers can achieve impressively accurate results on a challenging lattice field theory example.", "bibtex": "@InProceedings{pmlr-v162-matthews22a,\n title = \t {Continual Repeated Annealed Flow Transport {M}onte {C}arlo},\n author = {Matthews, Alex and Arbel, Michael and Rezende, Danilo Jimenez and Doucet, Arnaud},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15196--15219},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/matthews22a/matthews22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/matthews22a.html},\n abstract = \t {We propose Continual Repeated Annealed Flow Transport Monte Carlo (CRAFT), a method that combines a sequential Monte Carlo (SMC) sampler (itself a generalization of Annealed Importance Sampling) with variational inference using normalizing flows. The normalizing flows are directly trained to transport between annealing temperatures using a KL divergence for each transition. This optimization objective is itself estimated using the normalizing flow/SMC approximation. We show conceptually and using multiple empirical examples that CRAFT improves on Annealed Flow Transport Monte Carlo (Arbel et al., 2021), on which it builds and also on Markov chain Monte Carlo (MCMC) based Stochastic Normalizing Flows (Wu et al., 2020). 
By incorporating CRAFT within particle MCMC, we show that such learnt samplers can achieve impressively accurate results on a challenging lattice field theory example.}\n}", "pdf": "https://proceedings.mlr.press/v162/matthews22a/matthews22a.pdf", "supp": "", "pdf_size": 749970, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15272534120760724190&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "DeepMind; Universit\u00e9 Grenoble Alpes, Inria, CNRS; DeepMind; DeepMind", "aff_domain": "google.com; ;google.com;google.com", "email": "google.com; ;google.com;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/matthews22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "DeepMind;Universit\u00e9 Grenoble Alpes", "aff_unique_dep": ";", "aff_unique_url": "https://deepmind.com;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "DeepMind;UGA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United Kingdom;France" }, { "title": "Continuous Control with Action Quantization from Demonstrations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16077", "id": "16077", "proceeding": "https://proceedings.mlr.press/v162/dadashi22a.html", "poster": "/media/PosterPDFs/ICML%202022/30c8e1ca872524fbf7ea5c519ca397ee.png?t=1658073452.2454617", "slides": "", "author_site": "Robert Dadashi, L\u00e9onard Hussenot, Damien Vincent, Sertan Girgin, Anton Raichuk, Matthieu Geist, Olivier Pietquin", "author": "Robert Dadashi; L\u00e9onard Hussenot; Damien Vincent; Sertan Girgin; Anton Raichuk; Matthieu Geist; Olivier Pietquin", "abstract": "In this paper, we propose a novel Reinforcement Learning (RL) framework for problems with continuous action spaces: Action Quantization from Demonstrations (AQuaDem). The proposed approach consists in learning a discretization of continuous action spaces from human demonstrations. This discretization returns a set of plausible actions (in light of the demonstrations) for each input state, thus capturing the priors of the demonstrator and their multimodal behavior. By discretizing the action space, any discrete action deep RL technique can be readily applied to the continuous control problem. Experiments show that the proposed approach outperforms state-of-the-art methods such as SAC in the RL setup, and GAIL in the Imitation Learning setup. 
We provide a website with interactive videos: https://google-research.github.io/aquadem/ and make the code available: https://github.com/google-research/google-research/tree/master/aquadem.", "bibtex": "@InProceedings{pmlr-v162-dadashi22a,\n title = \t {Continuous Control with Action Quantization from Demonstrations},\n author = {Dadashi, Robert and Hussenot, L{\\'e}onard and Vincent, Damien and Girgin, Sertan and Raichuk, Anton and Geist, Matthieu and Pietquin, Olivier},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4537--4557},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dadashi22a/dadashi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dadashi22a.html},\n abstract = \t {In this paper, we propose a novel Reinforcement Learning (RL) framework for problems with continuous action spaces: Action Quantization from Demonstrations (AQuaDem). The proposed approach consists in learning a discretization of continuous action spaces from human demonstrations. This discretization returns a set of plausible actions (in light of the demonstrations) for each input state, thus capturing the priors of the demonstrator and their multimodal behavior. By discretizing the action space, any discrete action deep RL technique can be readily applied to the continuous control problem. Experiments show that the proposed approach outperforms state-of-the-art methods such as SAC in the RL setup, and GAIL in the Imitation Learning setup. We provide a website with interactive videos: https://google-research.github.io/aquadem/ and make the code available: https://github.com/google-research/google-research/tree/master/aquadem.}\n}", "pdf": "https://proceedings.mlr.press/v162/dadashi22a/dadashi22a.pdf", "supp": "", "pdf_size": 7929783, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18354958382752460493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research, Brain Team; Google Research, Brain Team + Univ. de Lille, CNRS, Inria Scool, UMR 9189 CRIStAL; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ", "github": "https://github.com/google-research/google-research/tree/master/aquadem", "project": "https://google-research.github.io/aquadem/", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/dadashi22a.html", "aff_unique_index": "0;0+1;0;0;0;0;0", "aff_unique_norm": "Google;University of Lille", "aff_unique_dep": "Google Research;Inria Scool, UMR 9189 CRIStAL", "aff_unique_url": "https://research.google;https://www.univ-lille.fr", "aff_unique_abbr": "Google;Univ. 
de Lille", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0+1;0;0;0;0;0", "aff_country_unique": "United States;France" }, { "title": "Continuous-Time Analysis of Accelerated Gradient Methods via Conservation Laws in Dilated Coordinate Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17997", "id": "17997", "proceeding": "https://proceedings.mlr.press/v162/suh22a.html", "poster": "", "slides": "", "author_site": "Jaewook Suh, Gyumin Roh, Ernest Ryu", "author": "Jaewook J Suh; Gyumin Roh; Ernest K Ryu", "abstract": "We analyze continuous-time models of accelerated gradient methods through deriving conservation laws in dilated coordinate systems. Namely, instead of analyzing the dynamics of $X(t)$, we analyze the dynamics of $W(t)=t^\\alpha(X(t)-X_c)$ for some $\\alpha$ and $X_c$ and derive a conserved quantity, analogous to physical energy, in this dilated coordinate system. Through this methodology, we recover many known continuous-time analyses in a streamlined manner and obtain novel continuous-time analyses for OGM-G, an acceleration mechanism for efficiently reducing gradient magnitude that is distinct from that of Nesterov. Finally, we show that a semi-second-order symplectic Euler discretization in the dilated coordinate system leads to an $\\mathcal{O}(1/k^2)$ rate on the standard setup of smooth convex minimization, without any further assumptions such as infinite differentiability.", "bibtex": "@InProceedings{pmlr-v162-suh22a,\n title = \t {Continuous-Time Analysis of Accelerated Gradient Methods via Conservation Laws in Dilated Coordinate Systems},\n author = {Suh, Jaewook J and Roh, Gyumin and Ryu, Ernest K},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20640--20667},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/suh22a/suh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/suh22a.html},\n abstract = \t {We analyze continuous-time models of accelerated gradient methods through deriving conservation laws in dilated coordinate systems. Namely, instead of analyzing the dynamics of $X(t)$, we analyze the dynamics of $W(t)=t^\\alpha(X(t)-X_c)$ for some $\\alpha$ and $X_c$ and derive a conserved quantity, analogous to physical energy, in this dilated coordinate system. Through this methodology, we recover many known continuous-time analyses in a streamlined manner and obtain novel continuous-time analyses for OGM-G, an acceleration mechanism for efficiently reducing gradient magnitude that is distinct from that of Nesterov. 
Finally, we show that a semi-second-order symplectic Euler discretization in the dilated coordinate system leads to an $\\mathcal{O}(1/k^2)$ rate on the standard setup of smooth convex minimization, without any further assumptions such as infinite differentiability.}\n}", "pdf": "https://proceedings.mlr.press/v162/suh22a/suh22a.pdf", "supp": "", "pdf_size": 377160, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12425314558748979147&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Mathematical Sciences, Seoul National University, Seoul, Korea; Department of Mathematical Sciences, Seoul National University, Seoul, Korea; Department of Mathematical Sciences, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr; ; ", "email": "snu.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/suh22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Mathematical Sciences", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Continuous-Time Modeling of Counterfactual Outcomes Using Neural Controlled Differential Equations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17111", "id": "17111", "proceeding": "https://proceedings.mlr.press/v162/seedat22b.html", "poster": "/media/PosterPDFs/ICML%202022/be3087e74e9100d4bc4c6268cdbe8456.png?t=1658159050.8739452", "slides": "", "author_site": "Nabeel Seedat, Fergus Imrie, Alexis Bellot, Zhaozhi Qian, Mihaela van der Schaar", "author": "Nabeel Seedat; Fergus Imrie; Alexis Bellot; Zhaozhi Qian; Mihaela van der Schaar", "abstract": "Estimating counterfactual outcomes over time has the potential to unlock personalized healthcare by assisting decision-makers to answer \"what-if\" questions. Existing causal inference approaches typically consider regular, discrete-time intervals between observations and treatment decisions and hence are unable to naturally model irregularly sampled data, which is the common setting in practice. To handle arbitrary observation patterns, we interpret the data as samples from an underlying continuous-time process and propose to model its latent trajectory explicitly using the mathematics of controlled differential equations. This leads to a new approach, the Treatment Effect Neural Controlled Differential Equation (TE-CDE), that allows the potential outcomes to be evaluated at any time point. In addition, adversarial training is used to adjust for time-dependent confounding which is critical in longitudinal settings and is an added challenge not encountered in conventional time series. To assess solutions to this problem, we propose a controllable simulation environment based on a model of tumor growth for a range of scenarios with irregular sampling reflective of a variety of clinical scenarios. 
TE-CDE consistently outperforms existing approaches in all scenarios with irregular sampling.", "bibtex": "@InProceedings{pmlr-v162-seedat22b,\n title = \t {Continuous-Time Modeling of Counterfactual Outcomes Using Neural Controlled Differential Equations},\n author = {Seedat, Nabeel and Imrie, Fergus and Bellot, Alexis and Qian, Zhaozhi and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19497--19521},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/seedat22b/seedat22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/seedat22b.html},\n abstract = \t {Estimating counterfactual outcomes over time has the potential to unlock personalized healthcare by assisting decision-makers to answer \"what-if\" questions. Existing causal inference approaches typically consider regular, discrete-time intervals between observations and treatment decisions and hence are unable to naturally model irregularly sampled data, which is the common setting in practice. To handle arbitrary observation patterns, we interpret the data as samples from an underlying continuous-time process and propose to model its latent trajectory explicitly using the mathematics of controlled differential equations. This leads to a new approach, the Treatment Effect Neural Controlled Differential Equation (TE-CDE), that allows the potential outcomes to be evaluated at any time point. In addition, adversarial training is used to adjust for time-dependent confounding which is critical in longitudinal settings and is an added challenge not encountered in conventional time series. To assess solutions to this problem, we propose a controllable simulation environment based on a model of tumor growth for a range of scenarios with irregular sampling reflective of a variety of clinical scenarios. TE-CDE consistently outperforms existing approaches in all scenarios with irregular sampling.}\n}", "pdf": "https://proceedings.mlr.press/v162/seedat22b/seedat22b.pdf", "supp": "", "pdf_size": 8498906, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5364793610407571415&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/seedat22b.html" }, { "title": "Contrastive Learning with Boosted Memorization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17473", "id": "17473", "proceeding": "https://proceedings.mlr.press/v162/zhou22l.html", "poster": "/media/PosterPDFs/ICML%202022/3baa271bc35fe054c86928f7016e8ae6_i7gHx1U.png?t=1657943716.8439794", "slides": "", "author_site": "Zhihan Zhou, Jiangchao Yao, Yan-Feng Wang, Bo Han, Ya Zhang", "author": "Zhihan Zhou; Jiangchao Yao; Yan-Feng Wang; Bo Han; Ya Zhang", "abstract": "Self-supervised learning has achieved a great success in the representation learning of visual and textual data. However, the current methods are mainly validated on the well-curated datasets, which do not exhibit the real-world long-tailed distribution. 
Recent attempts to consider self-supervised long-tailed learning are made by rebalancing in the loss perspective or the model perspective, resembling the paradigms in the supervised long-tailed learning. Nevertheless, without the aid of labels, these explorations have not shown the expected significant promise due to the limitation in tail sample discovery or the heuristic structure design. Different from previous works, we explore this direction from an alternative perspective, i.e., the data perspective, and propose a novel Boosted Contrastive Learning (BCL) method. Specifically, BCL leverages the memorization effect of deep neural networks to automatically drive the information discrepancy of the sample views in contrastive learning, which is more efficient to enhance the long-tailed learning in the label-unaware context. Extensive experiments on a range of benchmark datasets demonstrate the effectiveness of BCL over several state-of-the-art methods. Our code is available at https://github.com/MediaBrain-SJTU/BCL.", "bibtex": "@InProceedings{pmlr-v162-zhou22l,\n title = \t {Contrastive Learning with Boosted Memorization},\n author = {Zhou, Zhihan and Yao, Jiangchao and Wang, Yan-Feng and Han, Bo and Zhang, Ya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27367--27377},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22l/zhou22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22l.html},\n abstract = \t {Self-supervised learning has achieved a great success in the representation learning of visual and textual data. However, the current methods are mainly validated on the well-curated datasets, which do not exhibit the real-world long-tailed distribution. Recent attempts to consider self-supervised long-tailed learning are made by rebalancing in the loss perspective or the model perspective, resembling the paradigms in the supervised long-tailed learning. Nevertheless, without the aid of labels, these explorations have not shown the expected significant promise due to the limitation in tail sample discovery or the heuristic structure design. Different from previous works, we explore this direction from an alternative perspective, i.e., the data perspective, and propose a novel Boosted Contrastive Learning (BCL) method. Specifically, BCL leverages the memorization effect of deep neural networks to automatically drive the information discrepancy of the sample views in contrastive learning, which is more efficient to enhance the long-tailed learning in the label-unaware context. Extensive experiments on a range of benchmark datasets demonstrate the effectiveness of BCL over several state-of-the-art methods. 
Our code is available at https://github.com/MediaBrain-SJTU/BCL.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22l/zhou22l.pdf", "supp": "", "pdf_size": 575417, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1426610895759607761&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Cooperative Medianet Innovation Center, Shanghai Jiao Tong University+Shanghai AI Laboratory; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University+Shanghai AI Laboratory; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University+Shanghai AI Laboratory; Department of Computer Science, Hong Kong Baptist University; Cooperative Medianet Innovation Center, Shanghai Jiao Tong University+Shanghai AI Laboratory", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn; ; ;sjtu.edu.cn", "email": "sjtu.edu.cn;sjtu.edu.cn; ; ;sjtu.edu.cn", "github": "https://github.com/MediaBrain-SJTU/BCL", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhou22l.html", "aff_unique_index": "0+1;0+1;0+1;2;0+1", "aff_unique_norm": "Shanghai Jiao Tong University;Shanghai AI Laboratory;Hong Kong Baptist University", "aff_unique_dep": "Cooperative Medianet Innovation Center;;Department of Computer Science", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.shanghai-ai-lab.com;https://www.hkbu.edu.hk", "aff_unique_abbr": "SJTU;SAIL;HKBU", "aff_campus_unique_index": ";;;1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0", "aff_country_unique": "China" }, { "title": "Contrastive Mixture of Posteriors for Counterfactual Inference, Data Integration and Fairness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17233", "id": "17233", "proceeding": "https://proceedings.mlr.press/v162/foster22a.html", "poster": "/media/PosterPDFs/ICML%202022/721e7285b298cde5b3d0c973ed8d7b63.png?t=1657202148.7078555", "slides": "", "author_site": "Adam Foster, Arpi Vezer, Craig Glastonbury, P\u00e1id\u00ed Creed, Sam Abujudeh, Aaron Sim", "author": "Adam Foster; Arpi Vezer; Craig A. Glastonbury; Paidi Creed; Samer Abujudeh; Aaron Sim", "abstract": "Learning meaningful representations of data that can address challenges such as batch effect correction and counterfactual inference is a central problem in many domains including computational biology. Adopting a Conditional VAE framework, we show that marginal independence between the representation and a condition variable plays a key role in both of these challenges. We propose the Contrastive Mixture of Posteriors (CoMP) method that uses a novel misalignment penalty defined in terms of mixtures of the variational posteriors to enforce this independence in latent space. We show that CoMP has attractive theoretical properties compared to previous approaches, and we prove counterfactual identifiability of CoMP under additional assumptions. We demonstrate state-of-the-art performance on a set of challenging tasks including aligning human tumour samples with cancer cell-lines, predicting transcriptome-level perturbation responses, and batch correction on single-cell RNA sequencing data. We also find parallels to fair representation learning and demonstrate that CoMP is competitive on a common task in the field.", "bibtex": "@InProceedings{pmlr-v162-foster22a,\n title = \t {Contrastive Mixture of Posteriors for Counterfactual Inference, Data Integration and Fairness},\n author = {Foster, Adam and Vezer, Arpi and Glastonbury, Craig A. 
and Creed, Paidi and Abujudeh, Samer and Sim, Aaron},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6578--6621},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/foster22a/foster22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/foster22a.html},\n abstract = \t {Learning meaningful representations of data that can address challenges such as batch effect correction and counterfactual inference is a central problem in many domains including computational biology. Adopting a Conditional VAE framework, we show that marginal independence between the representation and a condition variable plays a key role in both of these challenges. We propose the Contrastive Mixture of Posteriors (CoMP) method that uses a novel misalignment penalty defined in terms of mixtures of the variational posteriors to enforce this independence in latent space. We show that CoMP has attractive theoretical properties compared to previous approaches, and we prove counterfactual identifiability of CoMP under additional assumptions. We demonstrate state-of-the-art performance on a set of challenging tasks including aligning human tumour samples with cancer cell-lines, predicting transcriptome-level perturbation responses, and batch correction on single-cell RNA sequencing data. We also find parallels to fair representation learning and demonstrate that CoMP is competitive on a common task in the field.}\n}", "pdf": "https://proceedings.mlr.press/v162/foster22a/foster22a.pdf", "supp": "", "pdf_size": 5615827, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7874050188706328624&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Microsoft Research, Cambridge, UK + BenevolentAI + University of Oxford; BenevolentAI; BenevolentAI + Human Technopole, V.le Rita Levi-Montalcini, 1, 20157 Milano, Italy; BenevolentAI; BenevolentAI; BenevolentAI", "aff_domain": "microsoft.com; ; ; ; ; ", "email": "microsoft.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/foster22a.html", "aff_unique_index": "0+1+2;1;1+3;1;1;1", "aff_unique_norm": "Microsoft;BenevolentAI;University of Oxford;Human Technopole", "aff_unique_dep": "Microsoft Research;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.benevolent.ai;https://www.ox.ac.uk;", "aff_unique_abbr": "MSR;BenevolentAI;Oxford;", "aff_campus_unique_index": "0;", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0+0;0;0+1;0;0;0", "aff_country_unique": "United Kingdom;Italy" }, { "title": "Contrastive UCB: Provably Efficient Contrastive Self-Supervised Learning in Online Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18337", "id": "18337", "proceeding": "https://proceedings.mlr.press/v162/qiu22c.html", "poster": "/media/PosterPDFs/ICML%202022/f18ee45840e18329939acf1095cdc5a9.png?t=1658337805.842187", "slides": "", "author_site": "Shuang Qiu, Lingxiao Wang, Chenjia Bai, Zhuoran Yang, Zhaoran Wang", "author": "Shuang Qiu; Lingxiao Wang; Chenjia Bai; Zhuoran Yang; Zhaoran Wang", "abstract": "In view of its power in extracting feature representation, contrastive self-supervised 
learning has been successfully integrated into the practice of (deep) reinforcement learning (RL), leading to efficient policy learning on various applications. Despite its tremendous empirical successes, the understanding of contrastive learning for RL remains elusive. To narrow such a gap, we study contrastive-learning empowered RL for a class of Markov decision processes (MDPs) and Markov games (MGs) with low-rank transitions. For both models, we propose to extract the correct feature representations of the low-rank model by minimizing a contrastive loss. Moreover, under the online setting, we propose novel upper confidence bound (UCB)-type algorithms that incorporate such a contrastive loss with online RL algorithms for MDPs or MGs. We further theoretically prove that our algorithm recovers the true representations and simultaneously achieves sample efficiency in learning the optimal policy and Nash equilibrium in MDPs and MGs. We also provide empirical studies to demonstrate the efficacy of the UCB-based contrastive learning method for RL. To the best of our knowledge, we provide the first provably efficient online RL algorithm that incorporates contrastive learning for representation learning.", "bibtex": "@InProceedings{pmlr-v162-qiu22c,\n title = \t {Contrastive {UCB}: Provably Efficient Contrastive Self-Supervised Learning in Online Reinforcement Learning},\n author = {Qiu, Shuang and Wang, Lingxiao and Bai, Chenjia and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18168--18210},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiu22c/qiu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiu22c.html},\n abstract = \t {In view of its power in extracting feature representation, contrastive self-supervised learning has been successfully integrated into the practice of (deep) reinforcement learning (RL), leading to efficient policy learning on various applications. Despite its tremendous empirical successes, the understanding of contrastive learning for RL remains elusive. To narrow such a gap, we study contrastive-learning empowered RL for a class of Markov decision processes (MDPs) and Markov games (MGs) with low-rank transitions. For both models, we propose to extract the correct feature representations of the low-rank model by minimizing a contrastive loss. Moreover, under the online setting, we propose novel upper confidence bound (UCB)-type algorithms that incorporate such a contrastive loss with online RL algorithms for MDPs or MGs. We further theoretically prove that our algorithm recovers the true representations and simultaneously achieves sample efficiency in learning the optimal policy and Nash equilibrium in MDPs and MGs. We also provide empirical studies to demonstrate the efficacy of the UCB-based contrastive learning method for RL. 
To the best of our knowledge, we provide the first provably efficient online RL algorithm that incorporates contrastive learning for representation learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/qiu22c/qiu22c.pdf", "supp": "", "pdf_size": 1841230, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4487688180752876620&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Chicago; Northwestern University; Shanghai AI Laboratory; Yale University; Northwestern University", "aff_domain": "umich.edu;u.northwestern.edu;gmail.com; ; ", "email": "umich.edu;u.northwestern.edu;gmail.com; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qiu22c.html", "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "University of Chicago;Northwestern University;Shanghai AI Laboratory;Yale University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uchicago.edu;https://www.northwestern.edu;https://www.shanghai-ai-lab.com;https://www.yale.edu", "aff_unique_abbr": "UChicago;NU;SAIL;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Controlling Conditional Language Models without Catastrophic Forgetting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16287", "id": "16287", "proceeding": "https://proceedings.mlr.press/v162/korbak22a.html", "poster": "/media/PosterPDFs/ICML%202022/abc99d6b9938aa86d1f30f8ee0fd169f.png?t=1657565857.5951488", "slides": "", "author_site": "Tomasz Korbak, Hady Elsahar, Germ\u00e1n Kruszewski, Marc Dymetman", "author": "Tomasz Korbak; Hady Elsahar; German Kruszewski; Marc Dymetman", "abstract": "Machine learning is shifting towards general-purpose pretrained generative models, trained in a self-supervised manner on large amounts of data, which can then be applied to solve a large number of tasks. However, due to their generic training methodology, these models often fail to meet some of the downstream requirements (e.g., hallucinations in abstractive summarization or style violations in code generation). This raises the important question of how to adapt pre-trained generative models to meet all requirements without destroying their general capabilities (\"catastrophic forgetting\"). Recent work has proposed to solve this problem by representing task-specific requirements through energy-based models (EBMs) and approximating these EBMs using distributional policy gradients (DPG). Despite its effectiveness, this approach is however limited to unconditional distributions. In this paper, we extend DPG to conditional tasks by proposing Conditional DPG (CDPG). We evaluate CDPG on four different control objectives across three tasks (translation, summarization and code generation) and two pretrained models (T5 and GPT-Neo). 
Our results show that fine-tuning using CDPG robustly moves these pretrained models closer towards meeting control objectives and \u2014 in contrast with baseline approaches \u2014 does not result in catastrophic forgetting.", "bibtex": "@InProceedings{pmlr-v162-korbak22a,\n title = \t {Controlling Conditional Language Models without Catastrophic Forgetting},\n author = {Korbak, Tomasz and Elsahar, Hady and Kruszewski, German and Dymetman, Marc},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11499--11528},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/korbak22a/korbak22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/korbak22a.html},\n abstract = \t {Machine learning is shifting towards general-purpose pretrained generative models, trained in a self-supervised manner on large amounts of data, which can then be applied to solve a large number of tasks. However, due to their generic training methodology, these models often fail to meet some of the downstream requirements (e.g., hallucinations in abstractive summarization or style violations in code generation). This raises the important question of how to adapt pre-trained generative models to meet all requirements without destroying their general capabilities (\"catastrophic forgetting\"). Recent work has proposed to solve this problem by representing task-specific requirements through energy-based models (EBMs) and approximating these EBMs using distributional policy gradients (DPG). Despite its effectiveness, this approach is however limited to unconditional distributions. In this paper, we extend DPG to conditional tasks by proposing Conditional DPG (CDPG). We evaluate CDPG on four different control objectives across three tasks (translation, summarization and code generation) and two pretrained models (T5 and GPT-Neo). 
Our results show that fine-tuning using CDPG robustly moves these pretrained models closer towards meeting control objectives and \u2014 in contrast with baseline approaches \u2014 does not result in catastrophic forgetting.}\n}", "pdf": "https://proceedings.mlr.press/v162/korbak22a/korbak22a.pdf", "supp": "", "pdf_size": 1339850, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13215553222930646661&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Sussex; Naver Labs Europe; Naver Labs Europe; Naver Labs Europe", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/korbak22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Sussex;NAVER LABS", "aff_unique_dep": ";", "aff_unique_url": "https://www.sussex.ac.uk;https://labs.naver.com", "aff_unique_abbr": "Sussex;NLE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;Unknown" }, { "title": "Convergence Rates of Non-Convex Stochastic Gradient Descent Under a Generic Lojasiewicz Condition and Local Smoothness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18099", "id": "18099", "proceeding": "https://proceedings.mlr.press/v162/scaman22a.html", "poster": "/media/PosterPDFs/ICML%202022/c2839bed26321da8b466c80a032e4714.png?t=1657183747.0477767", "slides": "", "author_site": "Kevin Scaman, Cedric Malherbe, Ludovic DOS SANTOS", "author": "Kevin Scaman; Cedric Malherbe; Ludovic Dos Santos", "abstract": "Training over-parameterized neural networks involves the empirical minimization of highly non-convex objective functions. Recently, a large body of works provided theoretical evidence that, despite this non-convexity, properly initialized over-parameterized networks can converge to a zero training loss through the introduction of the Polyak-Lojasiewicz condition. However, these analyses are restricted to quadratic losses such as mean square error, and tend to indicate fast exponential convergence rates that are seldom observed in practice. In this work, we propose to extend these results by analyzing stochastic gradient descent under more generic Lojasiewicz conditions that are applicable to any convex loss function, thus extending the current theory to a larger panel of losses commonly used in practice such as cross-entropy. 
Moreover, our analysis provides high-probability bounds on the approximation error under sub-Gaussian gradient noise and only requires the local smoothness of the objective function, thus making it applicable to deep neural networks in realistic settings.", "bibtex": "@InProceedings{pmlr-v162-scaman22a,\n title = \t {Convergence Rates of Non-Convex Stochastic Gradient Descent Under a Generic Lojasiewicz Condition and Local Smoothness},\n author = {Scaman, Kevin and Malherbe, Cedric and Santos, Ludovic Dos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19310--19327},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/scaman22a/scaman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/scaman22a.html},\n abstract = \t {Training over-parameterized neural networks involves the empirical minimization of highly non-convex objective functions. Recently, a large body of works provided theoretical evidence that, despite this non-convexity, properly initialized over-parameterized networks can converge to a zero training loss through the introduction of the Polyak-Lojasiewicz condition. However, these analyses are restricted to quadratic losses such as mean square error, and tend to indicate fast exponential convergence rates that are seldom observed in practice. In this work, we propose to extend these results by analyzing stochastic gradient descent under more generic Lojasiewicz conditions that are applicable to any convex loss function, thus extending the current theory to a larger panel of losses commonly used in practice such as cross-entropy. 
Moreover, our analysis provides high-probability bounds on the approximation error under sub-Gaussian gradient noise and only requires the local smoothness of the objective function, thus making it applicable to deep neural networks in realistic settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/scaman22a/scaman22a.pdf", "supp": "", "pdf_size": 882944, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10482114131481239616&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "DI ENS, \u00c9cole normale sup\u00e9rieure, CNRS, INRIA, PSL University+Huawei; Huawei Noah\u2019s Ark; Huawei Noah\u2019s Ark", "aff_domain": "inria.fr;huawei.com;huawei.com", "email": "inria.fr;huawei.com;huawei.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/scaman22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "\u00c9cole Normale Sup\u00e9rieure;Huawei", "aff_unique_dep": "DI ENS;Huawei Technologies Co., Ltd.", "aff_unique_url": "https://www.ens.fr;https://www.huawei.com", "aff_unique_abbr": "ENS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "France;China" }, { "title": "Convergence and Recovery Guarantees of the K-Subspaces Method for Subspace Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16255", "id": "16255", "proceeding": "https://proceedings.mlr.press/v162/wang22r.html", "poster": "/media/PosterPDFs/ICML%202022/1819932ff5cf474f4f19e7c7024640c2_NNDxCHf.png?t=1657492219.488653", "slides": "", "author_site": "Peng Wang, Huikang Liu, Anthony Man-Cho So, Laura Balzano", "author": "Peng Wang; Huikang Liu; Anthony Man-Cho So; Laura Balzano", "abstract": "The K-subspaces (KSS) method is a generalization of the K-means method for subspace clustering. In this work, we present local convergence analysis and a recovery guarantee for KSS, assuming data are generated by the semi-random union of subspaces model, where $N$ points are randomly sampled from $K \\ge 2$ overlapping subspaces. We show that if the initial assignment of the KSS method lies within a neighborhood of a true clustering, it converges at a superlinear rate and finds the correct clustering within $\\Theta(\\log\\log N)$ iterations with high probability. Moreover, we propose a thresholding inner-product based spectral method for initialization and prove that it produces a point in this neighborhood. We also present numerical results of the studied method to support our theoretical developments.", "bibtex": "@InProceedings{pmlr-v162-wang22r,\n title = \t {Convergence and Recovery Guarantees of the K-Subspaces Method for Subspace Clustering},\n author = {Wang, Peng and Liu, Huikang and So, Anthony Man-Cho and Balzano, Laura},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22884--22918},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22r/wang22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22r.html},\n abstract = \t {The K-subspaces (KSS) method is a generalization of the K-means method for subspace clustering. 
In this work, we present local convergence analysis and a recovery guarantee for KSS, assuming data are generated by the semi-random union of subspaces model, where $N$ points are randomly sampled from $K \\ge 2$ overlapping subspaces. We show that if the initial assignment of the KSS method lies within a neighborhood of a true clustering, it converges at a superlinear rate and finds the correct clustering within $\\Theta(\\log\\log N)$ iterations with high probability. Moreover, we propose a thresholding inner-product based spectral method for initialization and prove that it produces a point in this neighborhood. We also present numerical results of the studied method to support our theoretical developments.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22r/wang22r.pdf", "supp": "", "pdf_size": 610985, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4190201275040810423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Science, University of Michigan, Ann Arbor; Research Institute for Interdisciplinary Sciences, School of Information Management and Engineering, Shanghai University of Finance and Economics, Shanghai; Department of Systems Engineering and Engineering Management, The Chinese University of Hong Kong, Hong Kong; Department of Electrical Engineering and Computer Science, University of Michigan, Ann Arbor", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wang22r.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Michigan;Shanghai University of Finance and Economics;Chinese University of Hong Kong", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;School of Information Management and Engineering;Department of Systems Engineering and Engineering Management", "aff_unique_url": "https://www.umich.edu;http://www.sufe.edu.cn;https://www.cuhk.edu.hk", "aff_unique_abbr": "UM;SUFE;CUHK", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Ann Arbor;Shanghai;Hong Kong SAR", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;China" }, { "title": "Convergence of Invariant Graph Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16105", "id": "16105", "proceeding": "https://proceedings.mlr.press/v162/cai22b.html", "poster": "/media/PosterPDFs/ICML%202022/49c0b9d84c2a16fcaf9d25694fda75e1.png?t=1657494995.070906", "slides": "", "author_site": "Chen Cai, Yusu Wang", "author": "Chen Cai; Yusu Wang", "abstract": "Although theoretical properties such as expressive power and over-smoothing of graph neural networks (GNN) have been extensively studied recently, its convergence property is a relatively new direction. In this paper, we investigate the convergence of one powerful GNN, Invariant Graph Network (IGN) over graphs sampled from graphons. We first prove the stability of linear layers for general $k$-IGN (of order $k$) based on a novel interpretation of linear equivariant layers. Building upon this result, we prove the convergence of $k$-IGN under the model of \\citet{ruiz2020graphon}, where we access the edge weight but the convergence error is measured for graphon inputs. 
Under the more natural (and more challenging) setting of \\citet{keriven2020convergence} where one can only access 0-1 adjacency matrix sampled according to edge probability, we first show a negative result that the convergence of any IGN is not possible. We then obtain the convergence of a subset of IGNs, denoted as IGN-small, after the edge probability estimation. We show that IGN-small still contains function class rich enough that can approximate spectral GNNs arbitrarily well. Lastly, we perform experiments on various graphon models to verify our statements.", "bibtex": "@InProceedings{pmlr-v162-cai22b,\n title = \t {Convergence of Invariant Graph Networks},\n author = {Cai, Chen and Wang, Yusu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2457--2484},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cai22b/cai22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/cai22b.html},\n abstract = \t {Although theoretical properties such as expressive power and over-smoothing of graph neural networks (GNN) have been extensively studied recently, its convergence property is a relatively new direction. In this paper, we investigate the convergence of one powerful GNN, Invariant Graph Network (IGN) over graphs sampled from graphons. We first prove the stability of linear layers for general $k$-IGN (of order $k$) based on a novel interpretation of linear equivariant layers. Building upon this result, we prove the convergence of $k$-IGN under the model of \\citet{ruiz2020graphon}, where we access the edge weight but the convergence error is measured for graphon inputs. Under the more natural (and more challenging) setting of \\citet{keriven2020convergence} where one can only access 0-1 adjacency matrix sampled according to edge probability, we first show a negative result that the convergence of any IGN is not possible. We then obtain the convergence of a subset of IGNs, denoted as IGN-small, after the edge probability estimation. We show that IGN-small still contains function class rich enough that can approximate spectral GNNs arbitrarily well. 
Lastly, we perform experiments on various graphon models to verify our statements.}\n}", "pdf": "https://proceedings.mlr.press/v162/cai22b/cai22b.pdf", "supp": "", "pdf_size": 1673665, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9663144312739556296&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of California San Diego, San Diego, USA; University of California San Diego, San Diego, USA", "aff_domain": "ucsd.edu; ", "email": "ucsd.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/cai22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Convergence of Policy Gradient for Entropy Regularized MDPs with Neural Network Approximation in the Mean-Field Regime", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16285", "id": "16285", "proceeding": "https://proceedings.mlr.press/v162/leahy22a.html", "poster": "", "slides": "", "author_site": "James-Michael Leahy, Bekzhan Kerimkulov, David Siska, Lukasz Szpruch", "author": "James-Michael Leahy; Bekzhan Kerimkulov; David Siska; Lukasz Szpruch", "abstract": "We study the global convergence of policy gradient for infinite-horizon, continuous state and action space, and entropy-regularized Markov decision processes (MDPs). We consider a softmax policy with (one-hidden layer) neural network approximation in a mean-field regime. Additional entropic regularization in the associated mean-field probability measure is added, and the corresponding gradient flow is studied in the 2-Wasserstein metric. We show that the objective function is increasing along the gradient flow. Further, we prove that if the regularization in terms of the mean-field measure is sufficient, the gradient flow converges exponentially fast to the unique stationary solution, which is the unique maximizer of the regularized MDP objective. Lastly, we study the sensitivity of the value function along the gradient flow with respect to regularization parameters and the initial condition. 
Our results rely on the careful analysis of the non-linear Fokker\u2013Planck\u2013Kolmogorov equation and extend the pioneering work of \\cite{mei2020global} and \\cite{agarwal2020optimality}, which quantify the global convergence rate of policy gradient for entropy-regularized MDPs in the tabular setting.", "bibtex": "@InProceedings{pmlr-v162-leahy22a,\n title = \t {Convergence of Policy Gradient for Entropy Regularized {MDP}s with Neural Network Approximation in the Mean-Field Regime},\n author = {Leahy, James-Michael and Kerimkulov, Bekzhan and Siska, David and Szpruch, Lukasz},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12222--12252},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/leahy22a/leahy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/leahy22a.html},\n abstract = \t {We study the global convergence of policy gradient for infinite-horizon, continuous state and action space, and entropy-regularized Markov decision processes (MDPs). We consider a softmax policy with (one-hidden layer) neural network approximation in a mean-field regime. Additional entropic regularization in the associated mean-field probability measure is added, and the corresponding gradient flow is studied in the 2-Wasserstein metric. We show that the objective function is increasing along the gradient flow. Further, we prove that if the regularization in terms of the mean-field measure is sufficient, the gradient flow converges exponentially fast to the unique stationary solution, which is the unique maximizer of the regularized MDP objective. Lastly, we study the sensitivity of the value function along the gradient flow with respect to regularization parameters and the initial condition. 
Our results rely on the careful analysis of the non-linear Fokker\u2013Planck\u2013Kolmogorov equation and extend the pioneering work of \\cite{mei2020global} and \\cite{agarwal2020optimality}, which quantify the global convergence rate of policy gradient for entropy-regularized MDPs in the tabular setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/leahy22a/leahy22a.pdf", "supp": "", "pdf_size": 498228, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=606177943254282261&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "School of Mathematics, University of Edinburgh, Edinburgh, United Kingdom+The Alan Turing Institute, London, United Kingdom; Department of Mathematics, Imperial College London, London, United Kingdom; Vega Protocol, Gibraltar, Gibraltar+The Alan Turing Institute, London, United Kingdom; School of Mathematics, University of Edinburgh, Edinburgh, United Kingdom+The Alan Turing Institute, London, United Kingdom", "aff_domain": "ed.ac.uk;imperial.ac.uk;vegaprotocol.io;ed.ac.uk", "email": "ed.ac.uk;imperial.ac.uk;vegaprotocol.io;ed.ac.uk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/leahy22a.html", "aff_unique_index": "0+1;2;3+1;0+1", "aff_unique_norm": "University of Edinburgh;Alan Turing Institute;Imperial College London;Vega Protocol", "aff_unique_dep": "School of Mathematics;;Department of Mathematics;", "aff_unique_url": "https://www.ed.ac.uk;https://www.turing.ac.uk;https://www.imperial.ac.uk;", "aff_unique_abbr": "Edinburgh;ATI;Imperial College;", "aff_campus_unique_index": "0+1;1;1;0+1", "aff_campus_unique": "Edinburgh;London;", "aff_country_unique_index": "0+0;0;1+0;0+0", "aff_country_unique": "United Kingdom;Gibraltar" }, { "title": "Convergence of Uncertainty Sampling for Active Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17185", "id": "17185", "proceeding": "https://proceedings.mlr.press/v162/raj22a.html", "poster": "", "slides": "", "author_site": "Anant Raj, Francis Bach", "author": "Anant Raj; Francis Bach", "abstract": "Uncertainty sampling in active learning is heavily used in practice to reduce the annotation cost. However, there has been no wide consensus on the function to be used for uncertainty estimation in binary classification tasks and convergence guarantees of the corresponding active learning algorithms are not well understood. The situation is even more challenging for multi-category classification. In this work, we propose an efficient uncertainty estimator for binary classification which we also extend to multiple classes, and provide a non-asymptotic rate of convergence for our uncertainty sampling based active learning algorithm in both cases under no-noise conditions (i.e., linearly separable data). 
We also extend our analysis to the noisy case and provide theoretical guarantees for our algorithm under the influence of noise in the task of binary and multi-class classification.", "bibtex": "@InProceedings{pmlr-v162-raj22a,\n title = \t {Convergence of Uncertainty Sampling for Active Learning},\n author = {Raj, Anant and Bach, Francis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18310--18331},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/raj22a/raj22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/raj22a.html},\n abstract = \t {Uncertainty sampling in active learning is heavily used in practice to reduce the annotation cost. However, there has been no wide consensus on the function to be used for uncertainty estimation in binary classification tasks and convergence guarantees of the corresponding active learning algorithms are not well understood. The situation is even more challenging for multi-category classification. In this work, we propose an efficient uncertainty estimator for binary classification which we also extend to multiple classes, and provide a non-asymptotic rate of convergence for our uncertainty sampling based active learning algorithm in both cases under no-noise conditions (i.e., linearly separable data). We also extend our analysis to the noisy case and provide theoretical guarantees for our algorithm under the influence of noise in the task of binary and multi-class classification.}\n}", "pdf": "https://proceedings.mlr.press/v162/raj22a/raj22a.pdf", "supp": "", "pdf_size": 567035, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7938082596308224912&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Inria, Ecole Normale Sup\u00e9rieure, PSL Research University, Paris, France+Department of Electrical and Computer Engineering, Coordinated Science Laboratory University of Illinois at Urbana-Champaign, USA; Inria, Ecole Normale Sup\u00e9rieure, PSL Research University, Paris, France", "aff_domain": "inria.fr; ", "email": "inria.fr; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/raj22a.html", "aff_unique_index": "0+1;0", "aff_unique_norm": "INRIA;University of Illinois Urbana-Champaign", "aff_unique_dep": ";Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.inria.fr;https://illinois.edu", "aff_unique_abbr": "Inria;UIUC", "aff_campus_unique_index": "0+1;0", "aff_campus_unique": "Paris;Urbana-Champaign", "aff_country_unique_index": "0+1;0", "aff_country_unique": "France;United States" }, { "title": "Convolutional and Residual Networks Provably Contain Lottery Tickets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17731", "id": "17731", "proceeding": "https://proceedings.mlr.press/v162/burkholz22a.html", "poster": "/media/PosterPDFs/ICML%202022/5b8add2a5d98b1a652ea7fd72d942dac.png?t=1657263376.3351355", "slides": "", "author": "Rebekka Burkholz", "abstract": "The Lottery Ticket Hypothesis continues to have a profound practical impact on the quest for small scale deep neural networks that solve modern deep learning tasks at competitive performance. 
These lottery tickets are identified by pruning large randomly initialized neural networks with architectures that are as diverse as their applications. Yet, theoretical insights that attest their existence have been mostly focused on deep fully-connected feed forward networks with ReLU activation functions. We prove that also modern architectures consisting of convolutional and residual layers that can be equipped with almost arbitrary activation functions can contain lottery tickets with high probability.", "bibtex": "@InProceedings{pmlr-v162-burkholz22a,\n title = \t {Convolutional and Residual Networks Provably Contain Lottery Tickets},\n author = {Burkholz, Rebekka},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2414--2433},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/burkholz22a/burkholz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/burkholz22a.html},\n abstract = \t {The Lottery Ticket Hypothesis continues to have a profound practical impact on the quest for small scale deep neural networks that solve modern deep learning tasks at competitive performance. These lottery tickets are identified by pruning large randomly initialized neural networks with architectures that are as diverse as their applications. Yet, theoretical insights that attest their existence have been mostly focused on deep fully-connected feed forward networks with ReLU activation functions. We prove that also modern architectures consisting of convolutional and residual layers that can be equipped with almost arbitrary activation functions can contain lottery tickets with high probability.}\n}", "pdf": "https://proceedings.mlr.press/v162/burkholz22a/burkholz22a.pdf", "supp": "", "pdf_size": 2043368, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5628902554987130523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "CISPA Helmholtz Center for Information Security, Saarbr\u00fccken, Germany", "aff_domain": "cispa.de", "email": "cispa.de", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/burkholz22a.html", "aff_unique_index": "0", "aff_unique_norm": "CISPA Helmholtz Center for Information Security", "aff_unique_dep": "", "aff_unique_url": "https://www.cispa.de", "aff_unique_abbr": "CISPA", "aff_campus_unique_index": "0", "aff_campus_unique": "Saarbr\u00fccken", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "Cooperative Online Learning in Stochastic and Adversarial MDPs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16099", "id": "16099", "proceeding": "https://proceedings.mlr.press/v162/lancewicki22a.html", "poster": "/media/PosterPDFs/ICML%202022/838e8afb1ca34354ac209f53d90c3a43.png?t=1657630350.355903", "slides": "", "author_site": "Tal Lancewicki, Aviv Rosenberg, Yishay Mansour", "author": "Tal Lancewicki; Aviv Rosenberg; Yishay Mansour", "abstract": "We study cooperative online learning in stochastic and adversarial Markov decision process (MDP). That is, in each episode, $m$ agents interact with an MDP simultaneously and share information in order to minimize their individual regret. 
We consider environments with two types of randomness:", "bibtex": "@InProceedings{pmlr-v162-lancewicki22a,\n title = \t {Cooperative Online Learning in Stochastic and Adversarial {MDP}s},\n author = {Lancewicki, Tal and Rosenberg, Aviv and Mansour, Yishay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11918--11968},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lancewicki22a/lancewicki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lancewicki22a.html},\n abstract = \t {We study cooperative online learning in stochastic and adversarial Markov decision process (MDP). That is, in each episode, $m$ agents interact with an MDP simultaneously and share information in order to minimize their individual regret. We consider environments with two types of randomness:", "pdf": "https://proceedings.mlr.press/v162/lancewicki22a/lancewicki22a.pdf", "supp": "", "pdf_size": 604823, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15737038244347984259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Tel Aviv University; Tel Aviv University; Tel Aviv University + Google Research, Tel Aviv", "aff_domain": "mail.tau.ac.il; ; ", "email": "mail.tau.ac.il; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lancewicki22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Tel Aviv University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.tau.ac.il;https://research.google", "aff_unique_abbr": "TAU;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "Israel" }, { "title": "Coordinated Attacks against Contextual Bandits: Fundamental Limits and Defense Mechanisms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16913", "id": "16913", "proceeding": "https://proceedings.mlr.press/v162/kwon22a.html", "poster": "", "slides": "", "author_site": "Jeongyeol Kwon, Yonathan Efroni, Constantine Caramanis, Shie Mannor", "author": "Jeongyeol Kwon; Yonathan Efroni; Constantine Caramanis; Shie Mannor", "abstract": "Motivated by online recommendation systems, we propose the problem of finding the optimal policy in multitask contextual bandits when a small fraction $\\alpha < 1/2$ of tasks (users) are arbitrary and adversarial. The remaining fraction of good users share the same instance of contextual bandits with $S$ contexts and $A$ actions (items). Naturally, whether a user is good or adversarial is not known in advance. The goal is to robustly learn the policy that maximizes rewards for good users with as few user interactions as possible. Without adversarial users, established results in collaborative filtering show that $O(1/\\epsilon^2)$ per-user interactions suffice to learn a good policy, precisely because information can be shared across users. 
This parallelization gain is fundamentally altered by the presence of adversarial users: unless there are super-polynomial number of users, we show a lower bound of $\\tilde{\\Omega}(\\min(S,A) \\cdot \\alpha^2 / \\epsilon^2)$", "bibtex": "@InProceedings{pmlr-v162-kwon22a,\n title = \t {Coordinated Attacks against Contextual Bandits: Fundamental Limits and Defense Mechanisms},\n author = {Kwon, Jeongyeol and Efroni, Yonathan and Caramanis, Constantine and Mannor, Shie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11772--11789},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kwon22a/kwon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kwon22a.html},\n abstract = \t {Motivated by online recommendation systems, we propose the problem of finding the optimal policy in multitask contextual bandits when a small fraction $\\alpha < 1/2$ of tasks (users) are arbitrary and adversarial. The remaining fraction of good users share the same instance of contextual bandits with $S$ contexts and $A$ actions (items). Naturally, whether a user is good or adversarial is not known in advance. The goal is to robustly learn the policy that maximizes rewards for good users with as few user interactions as possible. Without adversarial users, established results in collaborative filtering show that $O(1/\\epsilon^2)$ per-user interactions suffice to learn a good policy, precisely because information can be shared across users. This parallelization gain is fundamentally altered by the presence of adversarial users: unless there are super-polynomial number of users, we show a lower bound of $\\tilde{\\Omega}(\\min(S,A) \\cdot \\alpha^2 / \\epsilon^2)$", "pdf": "https://proceedings.mlr.press/v162/kwon22a/kwon22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kwon22a-supp.zip", "pdf_size": 540325, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8173997044023481173&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, University of Texas at Austin, USA+Microsoft Research, New York; Microsoft Research, New York; Technion, Israel+Nvidia Research; Technion, Israel+Nvidia Research", "aff_domain": "utexas.edu;gmail.com; ; ", "email": "utexas.edu;gmail.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kwon22a.html", "aff_unique_index": "0+1;1;2+3;2+3", "aff_unique_norm": "University of Texas at Austin;Microsoft;Technion - Israel Institute of Technology;NVIDIA", "aff_unique_dep": "Department of Electrical and Computer Engineering;Microsoft Research;;NVIDIA Research", "aff_unique_url": "https://www.utexas.edu;https://www.microsoft.com/en-us/research;https://www.technion.ac.il/en/;https://www.nvidia.com/research", "aff_unique_abbr": "UT Austin;MSR;Technion;NVIDIA", "aff_campus_unique_index": "0+1;1;;", "aff_campus_unique": "Austin;New York;", "aff_country_unique_index": "0+0;0;1+0;1+0", "aff_country_unique": "United States;Israel" }, { "title": "Coordinated Double Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16849", "id": "16849", "proceeding": 
"https://proceedings.mlr.press/v162/fingerhut22a.html", "poster": "/media/PosterPDFs/ICML%202022/ffedf5be3a86e2ee281d54cdc97bc1cf.png?t=1657615057.1940558", "slides": "", "author_site": "Nitai Fingerhut, Matteo Sesia, Yaniv Romano", "author": "Nitai Fingerhut; Matteo Sesia; Yaniv Romano", "abstract": "Double machine learning is a statistical method for leveraging complex black-box models to construct approximately unbiased treatment effect estimates given observational data with high-dimensional covariates, under the assumption of a partially linear model. The idea is to first fit on a subset of the samples two non-linear predictive models, one for the continuous outcome of interest and one for the observed treatment, and then to estimate a linear coefficient for the treatment using the remaining samples through a simple orthogonalized regression. While this methodology is flexible and can accommodate arbitrary predictive models, typically trained independently of one another, this paper argues that a carefully coordinated learning algorithm for deep neural networks may reduce the estimation bias. The improved empirical performance of the proposed method is demonstrated through numerical experiments on both simulated and real data.", "bibtex": "@InProceedings{pmlr-v162-fingerhut22a,\n title = \t {Coordinated Double Machine Learning},\n author = {Fingerhut, Nitai and Sesia, Matteo and Romano, Yaniv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6499--6513},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fingerhut22a/fingerhut22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fingerhut22a.html},\n abstract = \t {Double machine learning is a statistical method for leveraging complex black-box models to construct approximately unbiased treatment effect estimates given observational data with high-dimensional covariates, under the assumption of a partially linear model. The idea is to first fit on a subset of the samples two non-linear predictive models, one for the continuous outcome of interest and one for the observed treatment, and then to estimate a linear coefficient for the treatment using the remaining samples through a simple orthogonalized regression. While this methodology is flexible and can accommodate arbitrary predictive models, typically trained independently of one another, this paper argues that a carefully coordinated learning algorithm for deep neural networks may reduce the estimation bias. 
The improved empirical performance of the proposed method is demonstrated through numerical experiments on both simulated and real data.}\n}", "pdf": "https://proceedings.mlr.press/v162/fingerhut22a/fingerhut22a.pdf", "supp": "", "pdf_size": 1079777, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3098806630799952921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Departments of Electrical and Computer Engineering and of Computer Science, Technion, Israel; Department of Data Sciences and Operations, University of Southern California, CA, USA; Departments of Electrical and Computer Engineering and of Computer Science, Technion, Israel", "aff_domain": "campus.technion.ac.il; ; ", "email": "campus.technion.ac.il; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/fingerhut22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Technion;University of Southern California", "aff_unique_dep": "Departments of Electrical and Computer Engineering and of Computer Science;Department of Data Sciences and Operations", "aff_unique_url": "https://www.technion.ac.il;https://www.usc.edu", "aff_unique_abbr": "Technion;USC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Correct-N-Contrast: a Contrastive Approach for Improving Robustness to Spurious Correlations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18223", "id": "18223", "proceeding": "https://proceedings.mlr.press/v162/zhang22z.html", "poster": "/media/PosterPDFs/ICML%202022/1e8a19426224ca89e83cef47f1e7f53b.png?t=1658284528.0867174", "slides": "", "author_site": "Michael Zhang, Nimit Sohoni, Hongyang Zhang, Chelsea Finn, Christopher Re", "author": "Michael Zhang; Nimit S Sohoni; Hongyang R Zhang; Chelsea Finn; Christopher Re", "abstract": "Spurious correlations pose a major challenge for robust machine learning. Models trained with empirical risk minimization (ERM) may learn to rely on correlations between class labels and spurious attributes, leading to poor performance on data groups without these correlations. This is challenging to address when the spurious attribute labels are unavailable. To improve worst-group performance on spuriously correlated data without training attribute labels, we propose Correct-N-Contrast (CNC), a contrastive approach to directly learn representations robust to spurious correlations. As ERM models can be good spurious attribute predictors, CNC works by (1) using a trained ERM model\u2019s outputs to identify samples with the same class but dissimilar spurious features, and (2) training a robust model with contrastive learning to learn similar representations for these samples. To support CNC, we introduce new connections between worst-group error and a representation alignment loss that CNC aims to minimize. We empirically observe that worst-group error closely tracks with alignment loss, and prove that the alignment loss over a class helps upper-bound the class\u2019s worst-group vs. average error gap. On popular benchmarks, CNC reduces alignment loss drastically, and achieves state-of-the-art worst-group accuracy by 3.6% average absolute lift. 
CNC is also competitive with oracle methods that require group labels.", "bibtex": "@InProceedings{pmlr-v162-zhang22z,\n title = \t {Correct-N-Contrast: a Contrastive Approach for Improving Robustness to Spurious Correlations},\n author = {Zhang, Michael and Sohoni, Nimit S and Zhang, Hongyang R and Finn, Chelsea and Re, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26484--26516},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22z/zhang22z.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22z.html},\n abstract = \t {Spurious correlations pose a major challenge for robust machine learning. Models trained with empirical risk minimization (ERM) may learn to rely on correlations between class labels and spurious attributes, leading to poor performance on data groups without these correlations. This is challenging to address when the spurious attribute labels are unavailable. To improve worst-group performance on spuriously correlated data without training attribute labels, we propose Correct-N-Contrast (CNC), a contrastive approach to directly learn representations robust to spurious correlations. As ERM models can be good spurious attribute predictors, CNC works by (1) using a trained ERM model\u2019s outputs to identify samples with the same class but dissimilar spurious features, and (2) training a robust model with contrastive learning to learn similar representations for these samples. To support CNC, we introduce new connections between worst-group error and a representation alignment loss that CNC aims to minimize. We empirically observe that worst-group error closely tracks with alignment loss, and prove that the alignment loss over a class helps upper-bound the class\u2019s worst-group vs. average error gap. On popular benchmarks, CNC reduces alignment loss drastically, and achieves state-of-the-art worst-group accuracy by 3.6% average absolute lift. 
CNC is also competitive with oracle methods that require group labels.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22z/zhang22z.pdf", "supp": "", "pdf_size": 17066872, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8960959356014477531&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Stanford University; Stanford University; Northeastern University; Stanford University; Stanford University", "aff_domain": "cs.stanford.edu; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22z.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Stanford University;Northeastern University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.northeastern.edu", "aff_unique_abbr": "Stanford;NEU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Correlated Quantization for Distributed Mean Estimation and Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17025", "id": "17025", "proceeding": "https://proceedings.mlr.press/v162/suresh22a.html", "poster": "/media/PosterPDFs/ICML%202022/ff7a2112f8c3e3224ce8e3e26de1d932.png?t=1657820195.270417", "slides": "/media/icml-2022/Slides/17025.pdf", "author_site": "Ananda Suresh, Ziteng Sun, Jae Ro, Felix Xinnan Yu", "author": "Ananda Theertha Suresh; Ziteng Sun; Jae Ro; Felix Yu", "abstract": "We study the problem of distributed mean estimation and optimization under communication constraints. We propose a correlated quantization protocol whose error guarantee depends on the deviation of data points instead of their absolute range. The design doesn\u2019t need any prior knowledge on the concentration property of the dataset, which is required to get such dependence in previous works. We show that applying the proposed protocol as a sub-routine in distributed optimization algorithms leads to better convergence rates. We also prove the optimality of our protocol under mild assumptions. Experimental results show that our proposed algorithm outperforms existing mean estimation protocols on a diverse set of tasks.", "bibtex": "@InProceedings{pmlr-v162-suresh22a,\n title = \t {Correlated Quantization for Distributed Mean Estimation and Optimization},\n author = {Suresh, Ananda Theertha and Sun, Ziteng and Ro, Jae and Yu, Felix},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20856--20876},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/suresh22a/suresh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/suresh22a.html},\n abstract = \t {We study the problem of distributed mean estimation and optimization under communication constraints. We propose a correlated quantization protocol whose error guarantee depends on the deviation of data points instead of their absolute range. The design doesn\u2019t need any prior knowledge on the concentration property of the dataset, which is required to get such dependence in previous works. 
We show that applying the proposed protocol as a sub-routine in distributed optimization algorithms leads to better convergence rates. We also prove the optimality of our protocol under mild assumptions. Experimental results show that our proposed algorithm outperforms existing mean estimation protocols on a diverse set of tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/suresh22a/suresh22a.pdf", "supp": "", "pdf_size": 431350, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7109782332404494753&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Google Research, New York; Google Research, New York; Google Research, New York; Google Research, New York", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/suresh22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Correlation Clustering via Strong Triadic Closure Labeling: Fast Approximation Algorithms and Practical Lower Bounds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16321", "id": "16321", "proceeding": "https://proceedings.mlr.press/v162/veldt22a.html", "poster": "/media/PosterPDFs/ICML%202022/8b0d268963dd0cfb808aac48a549829f.png?t=1657896787.0506015", "slides": "", "author": "Nate Veldt", "abstract": "Correlation clustering is a widely studied framework for clustering based on pairwise similarity and dissimilarity scores, but its best approximation algorithms rely on impractical linear programming relaxations. We present faster approximation algorithms that avoid these relaxations, for two well-studied special cases: cluster editing and cluster deletion. We accomplish this by drawing new connections to edge labeling problems related to the principle of strong triadic closure. This leads to faster and more practical linear programming algorithms, as well as extremely scalable combinatorial techniques, including the first combinatorial approximation algorithm for cluster deletion. In practice, our algorithms produce approximate solutions that nearly match the best algorithms in quality, while scaling to problems that are orders of magnitude larger.", "bibtex": "@InProceedings{pmlr-v162-veldt22a,\n title = \t {Correlation Clustering via Strong Triadic Closure Labeling: Fast Approximation Algorithms and Practical Lower Bounds},\n author = {Veldt, Nate},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22060--22083},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/veldt22a/veldt22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/veldt22a.html},\n abstract = \t {Correlation clustering is a widely studied framework for clustering based on pairwise similarity and dissimilarity scores, but its best approximation algorithms rely on impractical linear programming relaxations. 
We present faster approximation algorithms that avoid these relaxations, for two well-studied special cases: cluster editing and cluster deletion. We accomplish this by drawing new connections to edge labeling problems related to the principle of strong triadic closure. This leads to faster and more practical linear programming algorithms, as well as extremely scalable combinatorial techniques, including the first combinatorial approximation algorithm for cluster deletion. In practice, our algorithms produce approximate solutions that nearly match the best algorithms in quality, while scaling to problems that are orders of magnitude larger.}\n}", "pdf": "https://proceedings.mlr.press/v162/veldt22a/veldt22a.pdf", "supp": "", "pdf_size": 664709, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18023293593694212775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science and Engineering, Texas A&M University, College Station, Texas, USA", "aff_domain": "tamu.edu", "email": "tamu.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/veldt22a.html", "aff_unique_index": "0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0", "aff_campus_unique": "College Station", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Counterfactual Prediction for Outcome-Oriented Treatments", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17333", "id": "17333", "proceeding": "https://proceedings.mlr.press/v162/zou22a.html", "poster": "/media/PosterPDFs/ICML%202022/630eff1b380505a67570dff952ce4ad7.png?t=1657614905.9241936", "slides": "", "author_site": "Hao Zou, Bo Li, Jiangang Han, Shuiping Chen, Xuetao Ding, Peng Cui", "author": "Hao Zou; Bo Li; Jiangang Han; Shuiping Chen; Xuetao Ding; Peng Cui", "abstract": "Large amounts of efforts have been devoted into learning counterfactual treatment outcome under various settings, including binary/continuous/multiple treatments. Most of these literature aims to minimize the estimation error of counterfactual outcome for the whole treatment space. However, in most scenarios when the counterfactual prediction model is utilized to assist decision-making, people are only concerned with the small fraction of treatments that can potentially induce superior outcome (i.e. outcome-oriented treatments). This gap of objective is even more severe when the number of possible treatments is large, for example under the continuous treatment setting. To overcome it, we establish a new objective of optimizing counterfactual prediction on outcome-oriented treatments, propose a novel Outcome-Oriented Sample Re-weighting (OOSR) method to make the predictive model concentrate more on outcome-oriented treatments, and theoretically analyze that our method can improve treatment selection towards the optimal one. 
Extensive experimental results on both synthetic datasets and semi-synthetic datasets demonstrate the effectiveness of our method.", "bibtex": "@InProceedings{pmlr-v162-zou22a,\n title = \t {Counterfactual Prediction for Outcome-Oriented Treatments},\n author = {Zou, Hao and Li, Bo and Han, Jiangang and Chen, Shuiping and Ding, Xuetao and Cui, Peng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27693--27706},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zou22a/zou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zou22a.html},\n abstract = \t {Large amounts of efforts have been devoted into learning counterfactual treatment outcome under various settings, including binary/continuous/multiple treatments. Most of these literature aims to minimize the estimation error of counterfactual outcome for the whole treatment space. However, in most scenarios when the counterfactual prediction model is utilized to assist decision-making, people are only concerned with the small fraction of treatments that can potentially induce superior outcome (i.e. outcome-oriented treatments). This gap of objective is even more severe when the number of possible treatments is large, for example under the continuous treatment setting. To overcome it, we establish a new objective of optimizing counterfactual prediction on outcome-oriented treatments, propose a novel Outcome-Oriented Sample Re-weighting (OOSR) method to make the predictive model concentrate more on outcome-oriented treatments, and theoretically analyze that our method can improve treatment selection towards the optimal one. 
Extensive experimental results on both synthetic datasets and semi-synthetic datasets demonstrate the effectiveness of our method.}\n}", "pdf": "https://proceedings.mlr.press/v162/zou22a/zou22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zou22a-supp.zip", "pdf_size": 482164, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17915648538887715545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Technology, Tsinghua University, Beijing, China; School of Economics and Management, Tsinghua University, Beijing, China; Meituan, Beijing, China; Meituan, Beijing, China; Meituan, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn;sem.tsinghua.edu.cn;meituan.com;meituan.com;meituan.com;tsinghua.edu.cn", "email": "mails.tsinghua.edu.cn;sem.tsinghua.edu.cn;meituan.com;meituan.com;meituan.com;tsinghua.edu.cn", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zou22a.html", "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Tsinghua University;Meituan", "aff_unique_dep": "Department of Computer Science and Technology;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.meituan.com", "aff_unique_abbr": "THU;Meituan", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Counterfactual Transportability: A Formal Approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16747", "id": "16747", "proceeding": "https://proceedings.mlr.press/v162/correa22a.html", "poster": "/media/PosterPDFs/ICML%202022/328347805873e9a9c700591812fb0ec2.png?t=1658263580.546062", "slides": "", "author_site": "Juan Correa, Sanghack Lee, Elias Bareinboim", "author": "Juan D Correa; Sanghack Lee; Elias Bareinboim", "abstract": "Generalizing causal knowledge across environments is a common challenge shared across many of the data-driven disciplines, including AI and ML. Experiments are usually performed in one environment (e.g., in a lab, on Earth, in a training ground), almost invariably, with the intent of being used elsewhere (e.g., outside the lab, on Mars, in the real world), in an environment that is related but somewhat different than the original one, where certain conditions and mechanisms are likely to change. This generalization task has been studied in the causal inference literature under the rubric of transportability (Pearl and Bareinboim, 2011). While most transportability works focused on generalizing associational and interventional distributions, the generalization of counterfactual distributions has not been formally studied. In this paper, we investigate the transportability of counterfactuals from an arbitrary combination of observational and experimental distributions coming from disparate domains. Specifically, we introduce a sufficient and necessary graphical condition and develop an efficient, sound, and complete algorithm for transporting counterfactual quantities across domains in nonparametric settings. 
Failure of the algorithm implies the impossibility of generalizing the target counterfactual from the available data without further assumptions.", "bibtex": "@InProceedings{pmlr-v162-correa22a,\n title = \t {Counterfactual Transportability: A Formal Approach},\n author = {Correa, Juan D and Lee, Sanghack and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4370--4390},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/correa22a/correa22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/correa22a.html},\n abstract = \t {Generalizing causal knowledge across environments is a common challenge shared across many of the data-driven disciplines, including AI and ML. Experiments are usually performed in one environment (e.g., in a lab, on Earth, in a training ground), almost invariably, with the intent of being used elsewhere (e.g., outside the lab, on Mars, in the real world), in an environment that is related but somewhat different than the original one, where certain conditions and mechanisms are likely to change. This generalization task has been studied in the causal inference literature under the rubric of transportability (Pearl and Bareinboim, 2011). While most transportability works focused on generalizing associational and interventional distributions, the generalization of counterfactual distributions has not been formally studied. In this paper, we investigate the transportability of counterfactuals from an arbitrary combination of observational and experimental distributions coming from disparate domains. Specifically, we introduce a sufficient and necessary graphical condition and develop an efficient, sound, and complete algorithm for transporting counterfactual quantities across domains in nonparametric settings. 
Failure of the algorithm implies the impossibility of generalizing the target counterfactual from the available data without further assumptions.}\n}", "pdf": "https://proceedings.mlr.press/v162/correa22a/correa22a.pdf", "supp": "", "pdf_size": 483252, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13755894046993454950&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Universidad Aut\u00f3noma de Manizales, Manizales, Colombia; Graduate School of Data Science, Seoul National University, Seoul, South Korea; Department of Computer Science, Columbia University, New York, USA", "aff_domain": "autonoma.edu.co; ; ", "email": "autonoma.edu.co; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/correa22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Universidad Aut\u00f3noma de Manizales;Seoul National University;Columbia University", "aff_unique_dep": "Department of Computer Science;Graduate School of Data Science;Department of Computer Science", "aff_unique_url": "https://www.uam.edu.co;https://www.snu.ac.kr;https://www.columbia.edu", "aff_unique_abbr": ";SNU;Columbia", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Manizales;Seoul;New York", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Colombia;South Korea;United States" }, { "title": "Cross-Space Active Learning on Graph Convolutional Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18399", "id": "18399", "proceeding": "https://proceedings.mlr.press/v162/tao22a.html", "poster": "/media/PosterPDFs/ICML%202022/13ece95531e87921222a0f9d93230691.png?t=1657182835.7027004", "slides": "", "author_site": "Yufei Tao, Hao WU, Shiyuan Deng", "author": "Yufei Tao; Hao Wu; Shiyuan Deng", "abstract": "This paper formalizes", "bibtex": "@InProceedings{pmlr-v162-tao22a,\n title = \t {Cross-Space Active Learning on Graph Convolutional Networks},\n author = {Tao, Yufei and Wu, Hao and Deng, Shiyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21133--21145},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tao22a/tao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tao22a.html},\n abstract = \t {This paper formalizes", "pdf": "https://proceedings.mlr.press/v162/tao22a/tao22a.pdf", "supp": "", "pdf_size": 387784, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10458041681260697192&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, Chinese University of Hong Kong, Hong Kong, China; Department of Computer Science and Engineering, Chinese University of Hong Kong, Hong Kong, China; Department of Computer Science and Engineering, Chinese University of Hong Kong, Hong Kong, China", "aff_domain": "cse.cuhk.edu.hk; ; ", "email": "cse.cuhk.edu.hk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tao22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.cuhk.edu.hk", 
"aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "CtrlFormer: Learning Transferable State Representation for Visual Control via Transformer", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18045", "id": "18045", "proceeding": "https://proceedings.mlr.press/v162/mu22a.html", "poster": "/media/PosterPDFs/ICML%202022/208e43f0e45c4c78cafadb83d2888cb6.png?t=1656610416.162284", "slides": "/media/icml-2022/Slides/18045.pdf", "author_site": "Yao Mu, Shoufa Chen, Mingyu Ding, Jianyu Chen, Runjian Chen, Ping Luo", "author": "Yao Mark Mu; Shoufa Chen; Mingyu Ding; Jianyu Chen; Runjian Chen; Ping Luo", "abstract": "Transformer has achieved great successes in learning vision and language representation, which is general across various downstream tasks. In visual control, learning transferable state representation that can transfer between different control tasks is important to reduce the training sample size. However, porting Transformer to sample-efficient visual control remains a challenging and unsolved problem. To this end, we propose a novel Control Transformer (CtrlFormer), possessing many appealing benefits that prior arts do not have. Firstly, CtrlFormer jointly learns self-attention mechanisms between visual tokens and policy tokens among different control tasks, where multitask representation can be learned and transferred without catastrophic forgetting. Secondly, we carefully design a contrastive reinforcement learning paradigm to train CtrlFormer, enabling it to achieve high sample efficiency, which is important in control problems. For example, in the DMControl benchmark, unlike recent advanced methods that failed by producing a zero score in the \u201cCartpole\u201d task after transfer learning with 100k samples, CtrlFormer can achieve a state-of-the-art score with only 100k samples while maintaining the performance of previous tasks. The code and models are released in our project homepage.", "bibtex": "@InProceedings{pmlr-v162-mu22a,\n title = \t {{C}trl{F}ormer: Learning Transferable State Representation for Visual Control via Transformer},\n author = {Mu, Yao Mark and Chen, Shoufa and Ding, Mingyu and Chen, Jianyu and Chen, Runjian and Luo, Ping},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16043--16061},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mu22a/mu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mu22a.html},\n abstract = \t {Transformer has achieved great successes in learning vision and language representation, which is general across various downstream tasks. In visual control, learning transferable state representation that can transfer between different control tasks is important to reduce the training sample size. However, porting Transformer to sample-efficient visual control remains a challenging and unsolved problem. To this end, we propose a novel Control Transformer (CtrlFormer), possessing many appealing benefits that prior arts do not have. 
Firstly, CtrlFormer jointly learns self-attention mechanisms between visual tokens and policy tokens among different control tasks, where multitask representation can be learned and transferred without catastrophic forgetting. Secondly, we carefully design a contrastive reinforcement learning paradigm to train CtrlFormer, enabling it to achieve high sample efficiency, which is important in control problems. For example, in the DMControl benchmark, unlike recent advanced methods that failed by producing a zero score in the \u201cCartpole\u201d task after transfer learning with 100k samples, CtrlFormer can achieve a state-of-the-art score with only 100k samples while maintaining the performance of previous tasks. The code and models are released in our project homepage.}\n}", "pdf": "https://proceedings.mlr.press/v162/mu22a/mu22a.pdf", "supp": "", "pdf_size": 11806147, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15994281746681133957&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, the University of Hong Kong, Hong Kong; Department of Computer Science, the University of Hong Kong, Hong Kong; Department of Computer Science, the University of Hong Kong, Hong Kong; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University, Beijing, China; Department of Computer Science, the University of Hong Kong, Hong Kong; Department of Computer Science, the University of Hong Kong, Hong Kong", "aff_domain": "cs.hku.hk; ; ; ; ;cs.hku.hk", "email": "cs.hku.hk; ; ; ; ;cs.hku.hk", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/mu22a.html", "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "University of Hong Kong;Tsinghua University", "aff_unique_dep": "Department of Computer Science;Institute for Interdisciplinary Information Sciences (IIIS)", "aff_unique_url": "https://www.hku.hk;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HKU;Tsinghua", "aff_campus_unique_index": "0;0;0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Beijing", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Curriculum Reinforcement Learning via Constrained Optimal Transport", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16303", "id": "16303", "proceeding": "https://proceedings.mlr.press/v162/klink22a.html", "poster": "/media/PosterPDFs/ICML%202022/4c22bd444899d3b6047a10b20a2f26db.png?t=1657186514.0944455", "slides": "", "author_site": "Pascal Klink, Haoyi Yang, Carlo D'Eramo, Jan Peters, Joni Pajarinen", "author": "Pascal Klink; Haoyi Yang; Carlo D\u2019Eramo; Jan Peters; Joni Pajarinen", "abstract": "Curriculum reinforcement learning (CRL) allows solving complex tasks by generating a tailored sequence of learning tasks, starting from easy ones and subsequently increasing their difficulty. Although the potential of curricula in RL has been clearly shown in a variety of works, it is less clear how to generate them for a given learning environment, resulting in a variety of methods aiming to automate this task. In this work, we focus on the idea of framing curricula as interpolations between task distributions, which has previously been shown to be a viable approach to CRL. Identifying key issues of existing methods, we frame the generation of a curriculum as a constrained optimal transport problem between task distributions. 
Benchmarks show that this way of curriculum generation can improve upon existing CRL methods, yielding high performance in a variety of tasks with different characteristics.", "bibtex": "@InProceedings{pmlr-v162-klink22a,\n title = \t {Curriculum Reinforcement Learning via Constrained Optimal Transport},\n author = {Klink, Pascal and Yang, Haoyi and D'Eramo, Carlo and Peters, Jan and Pajarinen, Joni},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11341--11358},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/klink22a/klink22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/klink22a.html},\n abstract = \t {Curriculum reinforcement learning (CRL) allows solving complex tasks by generating a tailored sequence of learning tasks, starting from easy ones and subsequently increasing their difficulty. Although the potential of curricula in RL has been clearly shown in a variety of works, it is less clear how to generate them for a given learning environment, resulting in a variety of methods aiming to automate this task. In this work, we focus on the idea of framing curricula as interpolations between task distributions, which has previously been shown to be a viable approach to CRL. Identifying key issues of existing methods, we frame the generation of a curriculum as a constrained optimal transport problem between task distributions. Benchmarks show that this way of curriculum generation can improve upon existing CRL methods, yielding high performance in a variety of tasks with different characteristics.}\n}", "pdf": "https://proceedings.mlr.press/v162/klink22a/klink22a.pdf", "supp": "", "pdf_size": 2331841, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3952301864008711727&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Intelligent Autonomous Systems, Technical University of Darmstadt, Germany; Intelligent Autonomous Systems, Technical University of Darmstadt, Germany; Intelligent Autonomous Systems, Technical University of Darmstadt, Germany; Intelligent Autonomous Systems, Technical University of Darmstadt, Germany + Department of Electrical Engineering and Automation, Aalto University, Finland; Intelligent Autonomous Systems, Technical University of Darmstadt, Germany", "aff_domain": "tu-darmstadt.de; ; ; ; ", "email": "tu-darmstadt.de; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/klink22a.html", "aff_unique_index": "0;0;0;0+1;0", "aff_unique_norm": "Technical University of Darmstadt;Aalto University", "aff_unique_dep": "Intelligent Autonomous Systems;Department of Electrical Engineering and Automation", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.aalto.fi", "aff_unique_abbr": "TUD;Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1;0", "aff_country_unique": "Germany;Finland" }, { "title": "Cycle Representation Learning for Inductive Relation Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16857", "id": "16857", "proceeding": "https://proceedings.mlr.press/v162/yan22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/2ecd2bd94734e5dd392d8678bc64cdab.png?t=1656597557.4947157", "slides": "", "author_site": "Zuoyu Yan, Tengfei Ma, Liangcai Gao, Zhi Tang, Chao Chen", "author": "Zuoyu Yan; Tengfei Ma; Liangcai Gao; Zhi Tang; Chao Chen", "abstract": "In recent years, algebraic topology and its modern development, the theory of persistent homology, has shown great potential in graph representation learning. In this paper, based on the mathematics of algebraic topology, we propose a novel solution for inductive relation prediction, an important learning task for knowledge graph completion. To predict the relation between two entities, one can use the existence of rules, namely a sequence of relations. Previous works view rules as paths and primarily focus on the searching of paths between entities. The space of rules is huge, and one has to sacrifice either efficiency or accuracy. In this paper, we consider rules as cycles and show that the space of cycles has a unique structure based on the mathematics of algebraic topology. By exploring the linear structure of the cycle space, we can improve the searching efficiency of rules. We propose to collect cycle bases that span the space of cycles. We build a novel GNN framework on the collected cycles to learn the representations of cycles, and to predict the existence/non-existence of a relation. Our method achieves state-of-the-art performance on benchmarks.", "bibtex": "@InProceedings{pmlr-v162-yan22a,\n title = \t {Cycle Representation Learning for Inductive Relation Prediction},\n author = {Yan, Zuoyu and Ma, Tengfei and Gao, Liangcai and Tang, Zhi and Chen, Chao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24895--24910},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yan22a/yan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yan22a.html},\n abstract = \t {In recent years, algebraic topology and its modern development, the theory of persistent homology, has shown great potential in graph representation learning. In this paper, based on the mathematics of algebraic topology, we propose a novel solution for inductive relation prediction, an important learning task for knowledge graph completion. To predict the relation between two entities, one can use the existence of rules, namely a sequence of relations. Previous works view rules as paths and primarily focus on the searching of paths between entities. The space of rules is huge, and one has to sacrifice either efficiency or accuracy. In this paper, we consider rules as cycles and show that the space of cycles has a unique structure based on the mathematics of algebraic topology. By exploring the linear structure of the cycle space, we can improve the searching efficiency of rules. We propose to collect cycle bases that span the space of cycles. We build a novel GNN framework on the collected cycles to learn the representations of cycles, and to predict the existence/non-existence of a relation. 
Our method achieves state-of-the-art performance on benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/yan22a/yan22a.pdf", "supp": "", "pdf_size": 993375, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2061126116449549118&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Wangxuan Institute of Computer Technology, Peking University, Beijing, China+T. J. Watson Research Center, IBM, New York, USA; T. J. Watson Research Center, IBM, New York, USA; Wangxuan Institute of Computer Technology, Peking University, Beijing, China; Wangxuan Institute of Computer Technology, Peking University, Beijing, China; Department of Biomedical Informatics, Stony Brook University, New York, USA", "aff_domain": "pku.edu.cn;ibm.com;pku.edu.cn;pku.edu.cn;stonybrook.edu", "email": "pku.edu.cn;ibm.com;pku.edu.cn;pku.edu.cn;stonybrook.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yan22a.html", "aff_unique_index": "0+1;1;0;0;2", "aff_unique_norm": "Peking University;IBM;Stony Brook University", "aff_unique_dep": "Wangxuan Institute of Computer Technology;T. J. Watson Research Center;Department of Biomedical Informatics", "aff_unique_url": "http://www.pku.edu.cn;https://www.ibm.com;https://www.stonybrook.edu", "aff_unique_abbr": "PKU;IBM;SBU", "aff_campus_unique_index": "0+1;1;0;0;2", "aff_campus_unique": "Beijing;New York;Stony Brook", "aff_country_unique_index": "0+1;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "DAVINZ: Data Valuation using Deep Neural Networks at Initialization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17499", "id": "17499", "proceeding": "https://proceedings.mlr.press/v162/wu22j.html", "poster": "/media/PosterPDFs/ICML%202022/07ac7cd13fd0eb1654ccdbd222b81437.png?t=1657163447.2490058", "slides": "", "author_site": "Zhaoxuan Wu, Yao Shu, Bryan Kian Hsiang Low", "author": "Zhaoxuan Wu; Yao Shu; Bryan Kian Hsiang Low", "abstract": "Recent years have witnessed a surge of interest in developing trustworthy methods to evaluate the value of data in many real-world applications (e.g., collaborative machine learning, data marketplaces). Existing data valuation methods typically valuate data using the generalization performance of converged machine learning models after their long-term model training, hence making data valuation on large complex deep neural networks (DNNs) unaffordable. To this end, we theoretically derive a domain-aware generalization bound to estimate the generalization performance of DNNs without model training. We then exploit this theoretically derived generalization bound to develop a novel training-free data valuation method named data valuation at initialization (DAVINZ) on DNNs, which consistently achieves remarkable effectiveness and efficiency in practice. 
Moreover, our training-free DAVINZ, surprisingly, can even theoretically and empirically enjoy the desirable properties that training-based data valuation methods usually attain, thus making it more trustworthy in practice.", "bibtex": "@InProceedings{pmlr-v162-wu22j,\n title = \t {{DAVINZ}: Data Valuation using Deep Neural Networks at Initialization},\n author = {Wu, Zhaoxuan and Shu, Yao and Low, Bryan Kian Hsiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24150--24176},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22j/wu22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22j.html},\n abstract = \t {Recent years have witnessed a surge of interest in developing trustworthy methods to evaluate the value of data in many real-world applications (e.g., collaborative machine learning, data marketplaces). Existing data valuation methods typically valuate data using the generalization performance of converged machine learning models after their long-term model training, hence making data valuation on large complex deep neural networks (DNNs) unaffordable. To this end, we theoretically derive a domain-aware generalization bound to estimate the generalization performance of DNNs without model training. We then exploit this theoretically derived generalization bound to develop a novel training-free data valuation method named data valuation at initialization (DAVINZ) on DNNs, which consistently achieves remarkable effectiveness and efficiency in practice. 
Moreover, our training-free DAVINZ, surprisingly, can even theoretically and empirically enjoy the desirable properties that training-based data valuation methods usually attain, thus making it more trustworthy in practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22j/wu22j.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wu22j-supp.zip", "pdf_size": 902669, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7732438577401529320&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Institute of Data Science, National University of Singapore, Republic of Singapore+Integrative Sciences and Engineering Programme, NUSGS, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore", "aff_domain": "comp.nus.edu.sg; ;comp.nus.edu.sg", "email": "comp.nus.edu.sg; ;comp.nus.edu.sg", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22j.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "Institute of Data Science", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "Singapore" }, { "title": "DAdaQuant: Doubly-adaptive quantization for communication-efficient Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16009", "id": "16009", "proceeding": "https://proceedings.mlr.press/v162/honig22a.html", "poster": "/media/PosterPDFs/ICML%202022/f2e43fa3400d826df4195a9ac70dca62.png?t=1657277630.1442232", "slides": "/media/icml-2022/Slides/16009.pdf", "author_site": "Robert H\u00f6nig, Yiren Zhao, Robert Mullins", "author": "Robert H\u00f6nig; Yiren Zhao; Robert Mullins", "abstract": "Federated Learning (FL) is a powerful technique to train a model on a server with data from several clients in a privacy-preserving manner. FL incurs significant communication costs because it repeatedly transmits the model between the server and clients. Recently proposed algorithms quantize the model parameters to efficiently compress FL communication. We find that dynamic adaptations of the quantization level can boost compression without sacrificing model quality. We introduce DAdaQuant as a doubly-adaptive quantization algorithm that dynamically changes the quantization level across time and different clients. 
Our experiments show that DAdaQuant consistently improves client$\\rightarrow$server compression, outperforming the strongest non-adaptive baselines by up to $2.8\\times$.", "bibtex": "@InProceedings{pmlr-v162-honig22a,\n title = \t {{DA}da{Q}uant: Doubly-adaptive quantization for communication-efficient Federated Learning},\n author = {H{\\\"o}nig, Robert and Zhao, Yiren and Mullins, Robert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8852--8866},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/honig22a/honig22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/honig22a.html},\n abstract = \t {Federated Learning (FL) is a powerful technique to train a model on a server with data from several clients in a privacy-preserving manner. FL incurs significant communication costs because it repeatedly transmits the model between the server and clients. Recently proposed algorithms quantize the model parameters to efficiently compress FL communication. We find that dynamic adaptations of the quantization level can boost compression without sacrificing model quality. We introduce DAdaQuant as a doubly-adaptive quantization algorithm that dynamically changes the quantization level across time and different clients. Our experiments show that DAdaQuant consistently improves client$\\rightarrow$server compression, outperforming the strongest non-adaptive baselines by up to $2.8\\times$.}\n}", "pdf": "https://proceedings.mlr.press/v162/honig22a/honig22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/honig22a-supp.zip", "pdf_size": 495900, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13114481689249347847&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science and Technology, University of Cambridge, Cambridge, United Kingdom; Department of Computer Science and Technology, University of Cambridge, Cambridge, United Kingdom", "aff_domain": "ethz.ch; ; ", "email": "ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/honig22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;University of Cambridge", "aff_unique_dep": "Department of Computer Science;Department of Computer Science and Technology", "aff_unique_url": "https://www.ethz.ch;https://www.cam.ac.uk", "aff_unique_abbr": "ETHZ;Cambridge", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Zurich;Cambridge", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "DNA: Domain Generalization with Diversified Neural Averaging", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16943", "id": "16943", "proceeding": "https://proceedings.mlr.press/v162/chu22a.html", "poster": "/media/PosterPDFs/ICML%202022/9c82c7143c102b71c593d98d96093fde.png?t=1657531092.5606759", "slides": "", "author_site": "Xu Chu, Yujie Jin, Wenwu Zhu, Yasha Wang, Xin Wang, Shanghang Zhang, Hong Mei", "author": "Xu Chu; Yujie Jin; Wenwu Zhu; Yasha Wang; Xin Wang; Shanghang Zhang; Hong Mei", "abstract": "The inaccessibility of the target domain 
data makes domain generalization (DG) methods prone to forget target discriminative features, and challenges the pervasive theme in existing literature in pursuing a single classifier with an ideal joint risk. In contrast, this paper investigates model misspecification and attempts to bridge DG with classifier ensemble theoretically and methodologically. By introducing a pruned Jensen-Shannon (PJS) loss, we show that the target square-root risk w.r.t. the PJS loss of the $\rho$-ensemble (the averaged classifier weighted by a quasi-posterior $\rho$) is bounded by the averaged source square-root risk of the Gibbs classifiers. We derive a tighter bound by enforcing a positive principled diversity measure of the classifiers. We give a PAC-Bayes upper bound on the target square-root risk of the $\rho$-ensemble. Methodologically, we propose a diversified neural averaging (DNA) method for DG, which optimizes the proposed PAC-Bayes bound approximately. The DNA method samples Gibbs classifiers transversely and longitudinally by simultaneously considering the dropout variational family and optimization trajectory. The $\rho$-ensemble is approximated by averaging the longitudinal weights in a single run with dropout shut down, ensuring a fast ensemble with low computational overhead. Empirically, the proposed DNA method achieves the state-of-the-art classification performance on standard DG benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-chu22a,\n title = \t {{DNA}: Domain Generalization with Diversified Neural Averaging},\n author = {Chu, Xu and Jin, Yujie and Zhu, Wenwu and Wang, Yasha and Wang, Xin and Zhang, Shanghang and Mei, Hong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4010--4034},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chu22a/chu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chu22a.html},\n abstract = \t {The inaccessibility of the target domain data makes domain generalization (DG) methods prone to forget target discriminative features, and challenges the pervasive theme in existing literature in pursuing a single classifier with an ideal joint risk. In contrast, this paper investigates model misspecification and attempts to bridge DG with classifier ensemble theoretically and methodologically. By introducing a pruned Jensen-Shannon (PJS) loss, we show that the target square-root risk w.r.t. the PJS loss of the $\rho$-ensemble (the averaged classifier weighted by a quasi-posterior $\rho$) is bounded by the averaged source square-root risk of the Gibbs classifiers. We derive a tighter bound by enforcing a positive principled diversity measure of the classifiers. We give a PAC-Bayes upper bound on the target square-root risk of the $\rho$-ensemble. Methodologically, we propose a diversified neural averaging (DNA) method for DG, which optimizes the proposed PAC-Bayes bound approximately. The DNA method samples Gibbs classifiers transversely and longitudinally by simultaneously considering the dropout variational family and optimization trajectory. The $\rho$-ensemble is approximated by averaging the longitudinal weights in a single run with dropout shut down, ensuring a fast ensemble with low computational overhead. 
Empirically, the proposed DNA method achieves the state-of-the-art classification performance on standard DG benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/chu22a/chu22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chu22a-supp.zip", "pdf_size": 749820, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14531113840433296189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Tsinghua University; Peking University; Tsinghua University; Peking University; Tsinghua University; Peking University; Peking University", "aff_domain": "tsinghua.edu.cn;pku.edu.cn;tsinghua.edu.cn; ; ; ; ", "email": "tsinghua.edu.cn;pku.edu.cn;tsinghua.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/chu22a.html", "aff_unique_index": "0;1;0;1;0;1;1", "aff_unique_norm": "Tsinghua University;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "THU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "DNNR: Differential Nearest Neighbors Regression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18341", "id": "18341", "proceeding": "https://proceedings.mlr.press/v162/nader22a.html", "poster": "/media/PosterPDFs/ICML%202022/b887d8d5e65ac4dec3934028fe23ad72.png?t=1657809301.1589122", "slides": "", "author_site": "Youssef Nader, Leon Sixt, Tim Landgraf", "author": "Youssef Nader; Leon Sixt; Tim Landgraf", "abstract": "K-nearest neighbors (KNN) is one of the earliest and most established algorithms in machine learning. For regression tasks, KNN averages the targets within a neighborhood which poses a number of challenges: the neighborhood definition is crucial for the predictive performance as neighbors might be selected based on uninformative features, and averaging does not account for how the function changes locally. We propose a novel method called Differential Nearest Neighbors Regression (DNNR) that addresses both issues simultaneously: during training, DNNR estimates local gradients to scale the features; during inference, it performs an n-th order Taylor approximation using estimated gradients. In a large-scale evaluation on over 250 datasets, we find that DNNR performs comparably to state-of-the-art gradient boosting methods and MLPs while maintaining the simplicity and transparency of KNN. This allows us to derive theoretical error bounds and inspect failures. 
In times that call for transparency of ML models, DNNR provides a good balance between performance and interpretability.", "bibtex": "@InProceedings{pmlr-v162-nader22a,\n title = \t {{DNNR}: Differential Nearest Neighbors Regression},\n author = {Nader, Youssef and Sixt, Leon and Landgraf, Tim},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16296--16317},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nader22a/nader22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nader22a.html},\n abstract = \t {K-nearest neighbors (KNN) is one of the earliest and most established algorithms in machine learning. For regression tasks, KNN averages the targets within a neighborhood which poses a number of challenges: the neighborhood definition is crucial for the predictive performance as neighbors might be selected based on uninformative features, and averaging does not account for how the function changes locally. We propose a novel method called Differential Nearest Neighbors Regression (DNNR) that addresses both issues simultaneously: during training, DNNR estimates local gradients to scale the features; during inference, it performs an n-th order Taylor approximation using estimated gradients. In a large-scale evaluation on over 250 datasets, we find that DNNR performs comparably to state-of-the-art gradient boosting methods and MLPs while maintaining the simplicity and transparency of KNN. This allows us to derive theoretical error bounds and inspect failures. 
In times that call for transparency of ML models, DNNR provides a good balance between performance and interpretability.}\n}", "pdf": "https://proceedings.mlr.press/v162/nader22a/nader22a.pdf", "supp": "", "pdf_size": 2764781, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17277642609325925878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Freie Universit\u00e4t Berlin, Germany; Department of Computer Science, Freie Universit\u00e4t Berlin, Germany; Department of Computer Science, Freie Universit\u00e4t Berlin, Germany", "aff_domain": "fu-berlin.de;fu-berlin.de;fu-berlin.de", "email": "fu-berlin.de;fu-berlin.de;fu-berlin.de", "github": "https://github.com/younader/DNNR_paper_codeanalysis", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/nader22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Freie Universit\u00e4t Berlin", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.fu-berlin.de", "aff_unique_abbr": "FU Berlin", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "DNS: Determinantal Point Process Based Neural Network Sampler for Ensemble Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17375", "id": "17375", "proceeding": "https://proceedings.mlr.press/v162/sheikh22a.html", "poster": "/media/PosterPDFs/ICML%202022/4e6cd95227cb0c280e99a195be5f6615.png?t=1657744890.4623365", "slides": "", "author_site": "Hassam Sheikh, Kizza Nandyose Frisbee, mariano phielipp", "author": "Hassam Sheikh; Kizza Frisbee; Mariano Phielipp", "abstract": "The application of an ensemble of neural networks is becoming an imminent tool for advancing state-of-the-art deep reinforcement learning algorithms. However, training these large numbers of neural networks in the ensemble has an exceedingly high computation cost which may become a hindrance in training large-scale systems. In this paper, we propose DNS: a Determinantal Point Process based Neural Network Sampler that specifically uses k-DPP to sample a subset of neural networks for backpropagation at every training step thus significantly reducing the training time and computation cost. We integrated DNS in REDQ for continuous control tasks and evaluated on MuJoCo environments. Our experiments show that DNS augmented REDQ matches the baseline REDQ in terms of average cumulative reward and achieves this using less than 50% computation when measured in FLOPS. 
The code is available at https://github.com/IntelLabs/DNS", "bibtex": "@InProceedings{pmlr-v162-sheikh22a,\n title = \t {{DNS}: Determinantal Point Process Based Neural Network Sampler for Ensemble Reinforcement Learning},\n author = {Sheikh, Hassam and Frisbee, Kizza and Phielipp, Mariano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19731--19746},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sheikh22a/sheikh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sheikh22a.html},\n abstract = \t {The application of an ensemble of neural networks is becoming an imminent tool for advancing state-of-the-art deep reinforcement learning algorithms. However, training these large numbers of neural networks in the ensemble has an exceedingly high computation cost which may become a hindrance in training large-scale systems. In this paper, we propose DNS: a Determinantal Point Process based Neural Network Sampler that specifically uses k-DPP to sample a subset of neural networks for backpropagation at every training step thus significantly reducing the training time and computation cost. We integrated DNS in REDQ for continuous control tasks and evaluated on MuJoCo environments. Our experiments show that DNS augmented REDQ matches the baseline REDQ in terms of average cumulative reward and achieves this using less than 50% computation when measured in FLOPS. The code is available at https://github.com/IntelLabs/DNS}\n}", "pdf": "https://proceedings.mlr.press/v162/sheikh22a/sheikh22a.pdf", "supp": "", "pdf_size": 1653920, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16987143666282140914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "https://github.com/IntelLabs/DNS", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sheikh22a.html" }, { "title": "DRAGONN: Distributed Randomized Approximate Gradients of Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16701", "id": "16701", "proceeding": "https://proceedings.mlr.press/v162/wang22aj.html", "poster": "/media/PosterPDFs/ICML%202022/3a24b25a7b092a252166a1641ae953e7.png?t=1657857258.2057712", "slides": "", "author_site": "Zhuang Wang, Zhaozhuo Xu, Xinyu Wu, Anshumali Shrivastava, T. S. Eugene Ng", "author": "Zhuang Wang; Zhaozhuo Xu; Xinyu Wu; Anshumali Shrivastava; T. S. Eugene Ng", "abstract": "Data-parallel distributed training (DDT) has become the de-facto standard for accelerating the training of most deep learning tasks on massively parallel hardware. In the DDT paradigm, the communication overhead of gradient synchronization is the major efficiency bottleneck. A widely adopted approach to tackle this issue is gradient sparsification (GS). However, the current GS methods introduce significant new overhead in compressing the gradients, outweighing the communication overhead and becoming the new efficiency bottleneck. In this paper, we propose DRAGONN, a randomized hashing algorithm for GS in DDT. 
DRAGONN can significantly reduce the compression time by up to 70% compared to state-of-the-art GS approaches, and achieve up to 3.52x speedup in total training throughput.", "bibtex": "@InProceedings{pmlr-v162-wang22aj,\n title = \t {{DRAGONN}: Distributed Randomized Approximate Gradients of Neural Networks},\n author = {Wang, Zhuang and Xu, Zhaozhuo and Wu, Xinyu and Shrivastava, Anshumali and Ng, T. S. Eugene},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23274--23291},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22aj/wang22aj.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22aj.html},\n abstract = \t {Data-parallel distributed training (DDT) has become the de-facto standard for accelerating the training of most deep learning tasks on massively parallel hardware. In the DDT paradigm, the communication overhead of gradient synchronization is the major efficiency bottleneck. A widely adopted approach to tackle this issue is gradient sparsification (GS). However, the current GS methods introduce significant new overhead in compressing the gradients, outweighing the communication overhead and becoming the new efficiency bottleneck. In this paper, we propose DRAGONN, a randomized hashing algorithm for GS in DDT. DRAGONN can significantly reduce the compression time by up to 70% compared to state-of-the-art GS approaches, and achieve up to 3.52x speedup in total training throughput.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22aj/wang22aj.pdf", "supp": "", "pdf_size": 1454677, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4503258239663504220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Computer Science Department, Rice University, Houston, TX, USA+ThirdAI Corp, Houston, TX, USA; Computer Science Department, Rice University, Houston, TX, USA+ThirdAI Corp, Houston, TX, USA; Computer Science Department, Rice University, Houston, TX, USA; Computer Science Department, Rice University, Houston, TX, USA+ThirdAI Corp, Houston, TX, USA; Computer Science Department, Rice University, Houston, TX, USA", "aff_domain": "rice.edu;rice.edu;rice.edu;rice.edu;cs.rice.edu", "email": "rice.edu;rice.edu;rice.edu;rice.edu;cs.rice.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wang22aj.html", "aff_unique_index": "0+1;0+1;0;0+1;0", "aff_unique_norm": "Rice University;ThirdAI Corp", "aff_unique_dep": "Computer Science Department;", "aff_unique_url": "https://www.rice.edu;", "aff_unique_abbr": "Rice;", "aff_campus_unique_index": "0+0;0+0;0;0+0;0", "aff_campus_unique": "Houston", "aff_country_unique_index": "0+0;0+0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "DRIBO: Robust Deep Reinforcement Learning via Multi-View Information Bottleneck", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16441", "id": "16441", "proceeding": "https://proceedings.mlr.press/v162/fan22b.html", "poster": "/media/PosterPDFs/ICML%202022/75b9b6dc7fe44437c6e0a69fd863dbab_kUQGs8S.png?t=1658411444.0144582", "slides": "/media/icml-2022/Slides/16441.pdf", "author_site": "Jiameng Fan, Wenchao Li", "author": "Jiameng Fan; Wenchao Li", "abstract": 
"Deep reinforcement learning (DRL) agents are often sensitive to visual changes that were unseen in their training environments. To address this problem, we leverage the sequential nature of RL to learn robust representations that encode only task-relevant information from observations based on the unsupervised multi-view setting. Specif- ically, we introduce a novel contrastive version of the Multi-View Information Bottleneck (MIB) objective for temporal data. We train RL agents from pixels with this auxiliary objective to learn robust representations that can compress away task-irrelevant information and are predictive of task-relevant dynamics. This approach enables us to train high-performance policies that are robust to visual distractions and can generalize well to unseen environments. We demonstrate that our approach can achieve SOTA performance on a di- verse set of visual control tasks in the DeepMind Control Suite when the background is replaced with natural videos. In addition, we show that our approach outperforms well-established base- lines for generalization to unseen environments on the Procgen benchmark. Our code is open- sourced and available at https://github. com/BU-DEPEND-Lab/DRIBO.", "bibtex": "@InProceedings{pmlr-v162-fan22b,\n title = \t {{DRIBO}: Robust Deep Reinforcement Learning via Multi-View Information Bottleneck},\n author = {Fan, Jiameng and Li, Wenchao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6074--6102},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fan22b/fan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/fan22b.html},\n abstract = \t {Deep reinforcement learning (DRL) agents are often sensitive to visual changes that were unseen in their training environments. To address this problem, we leverage the sequential nature of RL to learn robust representations that encode only task-relevant information from observations based on the unsupervised multi-view setting. Specif- ically, we introduce a novel contrastive version of the Multi-View Information Bottleneck (MIB) objective for temporal data. We train RL agents from pixels with this auxiliary objective to learn robust representations that can compress away task-irrelevant information and are predictive of task-relevant dynamics. This approach enables us to train high-performance policies that are robust to visual distractions and can generalize well to unseen environments. We demonstrate that our approach can achieve SOTA performance on a di- verse set of visual control tasks in the DeepMind Control Suite when the background is replaced with natural videos. In addition, we show that our approach outperforms well-established base- lines for generalization to unseen environments on the Procgen benchmark. Our code is open- sourced and available at https://github. 
}\n}", "pdf": "https://proceedings.mlr.press/v162/fan22b/fan22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/fan22b-supp.zip", "pdf_size": 5471909, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17795910493641193453&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, Boston University, Boston, Massachusetts, USA; Department of Electrical and Computer Engineering, Boston University, Boston, Massachusetts, USA", "aff_domain": "bu.edu; ", "email": "bu.edu; ", "github": "https://github.com/BU-DEPEND-Lab/DRIBO", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/fan22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "DSTAGNN: Dynamic Spatial-Temporal Aware Graph Neural Network for Traffic Flow Forecasting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16007", "id": "16007", "proceeding": "https://proceedings.mlr.press/v162/lan22a.html", "poster": "/media/PosterPDFs/ICML%202022/67f7fb873eaf29526a11a9b7ac33bfac_pD4mrt3.png?t=1655262620.0325332", "slides": "/media/icml-2022/Slides/16007_u1jifE0.pdf", "author_site": "Shiyong Lan, Yitong Ma, Weikang Huang, Wenwu Wang, Hongyu Yang, pyang li", "author": "Shiyong Lan; Yitong Ma; Weikang Huang; Wenwu Wang; Hongyu Yang; Pyang Li", "abstract": "As a typical problem in time series analysis, traffic flow prediction is one of the most important application fields of machine learning. However, achieving highly accurate traffic flow prediction is a challenging task, due to the presence of complex dynamic spatial-temporal dependencies within a road network. This paper proposes a novel Dynamic Spatial-Temporal Aware Graph Neural Network (DSTAGNN) to model the complex spatial-temporal interaction in road network. First, considering the fact that historical data carries intrinsic dynamic information about the spatial structure of road networks, we propose a new dynamic spatial-temporal aware graph based on a data-driven strategy to replace the pre-defined static graph usually used in traditional graph convolution. Second, we design a novel graph neural network architecture, which can not only represent dynamic spatial relevance among nodes with an improved multi-head attention mechanism, but also acquire the wide range of dynamic temporal dependency from multi-receptive field features via multi-scale gated convolution. 
Extensive experiments on real-world data sets demonstrate that our proposed method significantly outperforms the state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-lan22a,\n title = \t {{DSTAGNN}: Dynamic Spatial-Temporal Aware Graph Neural Network for Traffic Flow Forecasting},\n author = {Lan, Shiyong and Ma, Yitong and Huang, Weikang and Wang, Wenwu and Yang, Hongyu and Li, Pyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11906--11917},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lan22a/lan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lan22a.html},\n abstract = \t {As a typical problem in time series analysis, traffic flow prediction is one of the most important application fields of machine learning. However, achieving highly accurate traffic flow prediction is a challenging task, due to the presence of complex dynamic spatial-temporal dependencies within a road network. This paper proposes a novel Dynamic Spatial-Temporal Aware Graph Neural Network (DSTAGNN) to model the complex spatial-temporal interaction in road network. First, considering the fact that historical data carries intrinsic dynamic information about the spatial structure of road networks, we propose a new dynamic spatial-temporal aware graph based on a data-driven strategy to replace the pre-defined static graph usually used in traditional graph convolution. Second, we design a novel graph neural network architecture, which can not only represent dynamic spatial relevance among nodes with an improved multi-head attention mechanism, but also acquire the wide range of dynamic temporal dependency from multi-receptive field features via multi-scale gated convolution. 
Extensive experiments on real-world data sets demonstrate that our proposed method significantly outperforms the state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/lan22a/lan22a.pdf", "supp": "", "pdf_size": 3608212, "gs_citation": 441, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6179934645315538129&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "College of Computer Science, Sichuan University, Chengdu, China; College of Computer Science, Sichuan University, Chengdu, China; National Key Laboratory of Fundamental Science on Synthetic Vision, Sichuan University, Chengdu, China; Department of Electrical and Electronic Engineering, University of Surrey, Guildford, UK; College of Computer Science, Sichuan University, Chengdu, China; College of Computer Science, Sichuan University, Chengdu, China", "aff_domain": "scu.edu.cn; ; ; ; ; ", "email": "scu.edu.cn; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/lan22a.html", "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Sichuan University;University of Surrey", "aff_unique_dep": "College of Computer Science;Department of Electrical and Electronic Engineering", "aff_unique_url": "https://www.scu.edu.cn;https://www.surrey.ac.uk", "aff_unique_abbr": "SCU;Surrey", "aff_campus_unique_index": "0;0;0;1;0;0", "aff_campus_unique": "Chengdu;Guildford", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Data Augmentation as Feature Manipulation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17621", "id": "17621", "proceeding": "https://proceedings.mlr.press/v162/shen22a.html", "poster": "/media/PosterPDFs/ICML%202022/cb3ce9b06932da6faaa7fc70d5b5d2f4.png?t=1657917382.4192274", "slides": "", "author_site": "Ruoqi Shen, Sebastien Bubeck, Suriya Gunasekar", "author": "Ruoqi Shen; Sebastien Bubeck; Suriya Gunasekar", "abstract": "Data augmentation is a cornerstone of the machine learning pipeline, yet its theoretical underpinnings remain unclear. Is it merely a way to artificially augment the data set size? Or is it about encouraging the model to satisfy certain invariances? In this work we consider another angle, and we study the effect of data augmentation on the dynamic of the learning process. We find that data augmentation can alter the relative importance of various features, effectively making certain informative but hard to learn features more likely to be captured in the learning process. Importantly, we show that this effect is more pronounced for non-linear models, such as neural networks. Our main contribution is a detailed analysis of data augmentation on the learning dynamic for a two layer convolutional neural network in the recently proposed multi-view model by Z. Allen-Zhu and Y. Li. 
We complement this analysis with further experimental evidence that data augmentation can be viewed as a form of feature manipulation.", "bibtex": "@InProceedings{pmlr-v162-shen22a,\n title = \t {Data Augmentation as Feature Manipulation},\n author = {Shen, Ruoqi and Bubeck, Sebastien and Gunasekar, Suriya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19773--19808},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22a/shen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22a.html},\n abstract = \t {Data augmentation is a cornerstone of the machine learning pipeline, yet its theoretical underpinnings remain unclear. Is it merely a way to artificially augment the data set size? Or is it about encouraging the model to satisfy certain invariances? In this work we consider another angle, and we study the effect of data augmentation on the dynamic of the learning process. We find that data augmentation can alter the relative importance of various features, effectively making certain informative but hard to learn features more likely to be captured in the learning process. Importantly, we show that this effect is more pronounced for non-linear models, such as neural networks. Our main contribution is a detailed analysis of data augmentation on the learning dynamic for a two layer convolutional neural network in the recently proposed multi-view model by Z. Allen-Zhu and Y. Li. We complement this analysis with further experimental evidence that data augmentation can be viewed as a form of feature manipulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22a/shen22a.pdf", "supp": "", "pdf_size": 609289, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8511009105279544089&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Washington + Microsoft Research; Microsoft Research; Microsoft Research", "aff_domain": "cs.washington.edu; ; ", "email": "cs.washington.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/shen22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "University of Washington;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.washington.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UW;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Data Determines Distributional Robustness in Contrastive Language Image Pre-training (CLIP)", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18149", "id": "18149", "proceeding": "https://proceedings.mlr.press/v162/fang22a.html", "poster": "/media/PosterPDFs/ICML%202022/2b8dec786a8cdf5889309995733b54d4.png?t=1657848423.1607592", "slides": "", "author_site": "Alex Fang, Gabriel Ilharco, Mitchell Wortsman, Yuhao Wan, Vaishaal Shankar, Achal Dave, Ludwig Schmidt", "author": "Alex Fang; Gabriel Ilharco; Mitchell Wortsman; Yuhao Wan; Vaishaal Shankar; Achal Dave; Ludwig Schmidt", "abstract": "Contrastively trained language-image models such as CLIP, ALIGN, and BASIC have demonstrated unprecedented robustness to 
multiple challenging natural distribution shifts. Since these language-image models differ from previous training approaches in several ways, an important question is what causes the large robustness gains. We answer this question via a systematic experimental investigation. Concretely, we study five different possible causes for the robustness gains: (i) the training set size, (ii) the training distribution, (iii) language supervision at training time, (iv) language supervision at test time, and (v) the contrastive loss function. Our experiments show that the more diverse training distribution is the main cause for the robustness gains, with the other factors contributing little to no robustness. Beyond our experimental results, we also introduce ImageNet-Captions, a version of ImageNet with original text annotations from Flickr, to enable further controlled experiments of language-image training.", "bibtex": "@InProceedings{pmlr-v162-fang22a,\n title = \t {Data Determines Distributional Robustness in Contrastive Language Image Pre-training ({CLIP})},\n author = {Fang, Alex and Ilharco, Gabriel and Wortsman, Mitchell and Wan, Yuhao and Shankar, Vaishaal and Dave, Achal and Schmidt, Ludwig},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6216--6234},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fang22a/fang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fang22a.html},\n abstract = \t {Contrastively trained language-image models such as CLIP, ALIGN, and BASIC have demonstrated unprecedented robustness to multiple challenging natural distribution shifts. Since these language-image models differ from previous training approaches in several ways, an important question is what causes the large robustness gains. We answer this question via a systematic experimental investigation. Concretely, we study five different possible causes for the robustness gains: (i) the training set size, (ii) the training distribution, (iii) language supervision at training time, (iv) language supervision at test time, and (v) the contrastive loss function. Our experiments show that the more diverse training distribution is the main cause for the robustness gains, with the other factors contributing little to no robustness. 
Beyond our experimental results, we also introduce ImageNet-Captions, a version of ImageNet with original text annotations from Flickr, to enable further controlled experiments of language-image training.}\n}", "pdf": "https://proceedings.mlr.press/v162/fang22a/fang22a.pdf", "supp": "", "pdf_size": 1668268, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12568254041342889008&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Washington; University of Washington; University of Washington; University of Washington; Amazon; Amazon; Allen Institute for Artificial Intelligence + University of Washington", "aff_domain": "cs.washington.edu; ; ; ; ; ; ", "email": "cs.washington.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/fang22a.html", "aff_unique_index": "0;0;0;0;1;1;2+0", "aff_unique_norm": "University of Washington;Amazon;Allen Institute for Artificial Intelligence", "aff_unique_dep": ";Amazon.com, Inc.;", "aff_unique_url": "https://www.washington.edu;https://www.amazon.com;https://allenai.org", "aff_unique_abbr": "UW;Amazon;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Data Scaling Laws in NMT: The Effect of Noise and Architecture", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17567", "id": "17567", "proceeding": "https://proceedings.mlr.press/v162/bansal22b.html", "poster": "/media/PosterPDFs/ICML%202022/8a146f1a3da4700cbf03cdc55e2daae6.png?t=1658234276.0195978", "slides": "", "author_site": "Yamini Bansal, Behrooz Ghorbani, Ankush Garg, Biao Zhang, Colin Cherry, Behnam Neyshabur, Orhan Firat", "author": "Yamini Bansal; Behrooz Ghorbani; Ankush Garg; Biao Zhang; Colin Cherry; Behnam Neyshabur; Orhan Firat", "abstract": "In this work, we study the effect of varying the architecture and training data quality on the data scaling properties of Neural Machine Translation (NMT). First, we establish that the test loss of encoder-decoder transformer models scales as a power law in the number of training samples, with a dependence on the model size. Then, we systematically vary aspects of the training setup to understand how they impact the data scaling laws. In particular, we change the following (1) Architecture and task setup: We compare to a transformer-LSTM hybrid, and a decoder-only transformer with a language modeling loss (2) Noise level in the training distribution: We experiment with filtering, and adding iid synthetic noise. In all the above cases, we find that the data scaling exponents are minimally impacted, suggesting that marginally worse architectures or training data can be compensated for by adding more data. 
Lastly, we find that using back-translated data instead of parallel data, can significantly degrade the scaling exponent.", "bibtex": "@InProceedings{pmlr-v162-bansal22b,\n title = \t {Data Scaling Laws in {NMT}: The Effect of Noise and Architecture},\n author = {Bansal, Yamini and Ghorbani, Behrooz and Garg, Ankush and Zhang, Biao and Cherry, Colin and Neyshabur, Behnam and Firat, Orhan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1466--1482},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bansal22b/bansal22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/bansal22b.html},\n abstract = \t {In this work, we study the effect of varying the architecture and training data quality on the data scaling properties of Neural Machine Translation (NMT). First, we establish that the test loss of encoder-decoder transformer models scales as a power law in the number of training samples, with a dependence on the model size. Then, we systematically vary aspects of the training setup to understand how they impact the data scaling laws. In particular, we change the following (1) Architecture and task setup: We compare to a transformer-LSTM hybrid, and a decoder-only transformer with a language modeling loss (2) Noise level in the training distribution: We experiment with filtering, and adding iid synthetic noise. In all the above cases, we find that the data scaling exponents are minimally impacted, suggesting that marginally worse architectures or training data can be compensated for by adding more data. Lastly, we find that using back-translated data instead of parallel data, can significantly degrade the scaling exponent.}\n}", "pdf": "https://proceedings.mlr.press/v162/bansal22b/bansal22b.pdf", "supp": "", "pdf_size": 2751116, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13927416013286476769&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/bansal22b.html" }, { "title": "Data-Efficient Double-Win Lottery Tickets from Robust Pre-training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16055", "id": "16055", "proceeding": "https://proceedings.mlr.press/v162/chen22ae.html", "poster": "/media/PosterPDFs/ICML%202022/e562cd9c0768d5464b64cf61da7fc6bb.png?t=1657516443.597234", "slides": "", "author_site": "Tianlong Chen, Zhenyu Zhang, Sijia Liu, Yang Zhang, Shiyu Chang, Zhangyang \u201cAtlas\u201d Wang", "author": "Tianlong Chen; Zhenyu Zhang; Sijia Liu; Yang Zhang; Shiyu Chang; Zhangyang Wang", "abstract": "Pre-training serves as a broadly adopted starting point for transfer learning on various downstream tasks. Recent investigations of lottery tickets hypothesis (LTH) demonstrate such enormous pre-trained models can be replaced by extremely sparse subnetworks (a.k.a. matching subnetworks) without sacrificing transferability. However, practical security-crucial applications usually pose more challenging requirements beyond standard transfer, which also demand these subnetworks to overcome adversarial vulnerability. 
In this paper, we formulate a more rigorous concept, Double-Win Lottery Tickets, in which a located subnetwork from a pre-trained model can be independently transferred on diverse downstream tasks, to reach BOTH the same standard and robust generalization, under BOTH standard and adversarial training regimes, as the full pre-trained model can do. We comprehensively examine various pre-training mechanisms and find that robust pre-training tends to craft sparser double-win lottery tickets with superior performance over the standard counterparts. For example, on downstream CIFAR-10/100 datasets, we identify double-win matching subnetworks with the standard, fast adversarial, and adversarial pre-training from ImageNet, at 89.26%/73.79%, 89.26%/79.03%, and 91.41%/83.22% sparsity, respectively. Furthermore, we observe the obtained double-win lottery tickets can be more data-efficient to transfer, under practical data-limited (e.g., 1% and 10%) downstream schemes. Our results show that the benefits from robust pre-training are amplified by the lottery ticket scheme, as well as the data-limited transfer setting. Codes are available at https://github.com/VITA-Group/Double-Win-LTH.", "bibtex": "@InProceedings{pmlr-v162-chen22ae,\n title = \t {Data-Efficient Double-Win Lottery Tickets from Robust Pre-training},\n author = {Chen, Tianlong and Zhang, Zhenyu and Liu, Sijia and Zhang, Yang and Chang, Shiyu and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3747--3759},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ae/chen22ae.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ae.html},\n abstract = \t {Pre-training serves as a broadly adopted starting point for transfer learning on various downstream tasks. Recent investigations of lottery tickets hypothesis (LTH) demonstrate such enormous pre-trained models can be replaced by extremely sparse subnetworks (a.k.a. matching subnetworks) without sacrificing transferability. However, practical security-crucial applications usually pose more challenging requirements beyond standard transfer, which also demand these subnetworks to overcome adversarial vulnerability. In this paper, we formulate a more rigorous concept, Double-Win Lottery Tickets, in which a located subnetwork from a pre-trained model can be independently transferred on diverse downstream tasks, to reach BOTH the same standard and robust generalization, under BOTH standard and adversarial training regimes, as the full pre-trained model can do. We comprehensively examine various pre-training mechanisms and find that robust pre-training tends to craft sparser double-win lottery tickets with superior performance over the standard counterparts. For example, on downstream CIFAR-10/100 datasets, we identify double-win matching subnetworks with the standard, fast adversarial, and adversarial pre-training from ImageNet, at 89.26%/73.79%, 89.26%/79.03%, and 91.41%/83.22% sparsity, respectively. Furthermore, we observe the obtained double-win lottery tickets can be more data-efficient to transfer, under practical data-limited (e.g., 1% and 10%) downstream schemes. 
Our results show that the benefits from robust pre-training are amplified by the lottery ticket scheme, as well as the data-limited transfer setting. Codes are available at https://github.com/VITA-Group/Double-Win-LTH.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ae/chen22ae.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22ae-supp.zip", "pdf_size": 2015472, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2999471991915534947&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "https://github.com/VITA-Group/Double-Win-LTH", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22ae.html" }, { "title": "Data-SUITE: Data-centric identification of in-distribution incongruous examples", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17079", "id": "17079", "proceeding": "https://proceedings.mlr.press/v162/seedat22a.html", "poster": "/media/PosterPDFs/ICML%202022/b7046757c3682a28c5bf2024e57678a0.png?t=1657821716.0099711", "slides": "", "author_site": "Nabeel Seedat, Jonathan Crabb\u00e9, Mihaela van der Schaar", "author": "Nabeel Seedat; Jonathan Crabb\u00e9; Mihaela van der Schaar", "abstract": "Systematic quantification of data quality is critical for consistent model performance. Prior works have focused on out-of-distribution data. Instead, we tackle an understudied yet equally important problem of characterizing incongruous regions of in-distribution (ID) data, which may arise from feature space heterogeneity. To this end, we propose a paradigm shift with Data-SUITE: a data-centric AI framework to identify these regions, independent of a task-specific model. Data-SUITE leverages copula modeling, representation learning, and conformal prediction to build feature-wise confidence interval estimators based on a set of training instances. These estimators can be used to evaluate the congruence of test instances with respect to the training set, to answer two practically useful questions: (1) which test instances will be reliably predicted by a model trained with the training instances? and (2) can we identify incongruous regions of the feature space so that data owners understand the data\u2019s limitations or guide future data collection? We empirically validate Data-SUITE\u2019s performance and coverage guarantees and demonstrate on cross-site medical data, biased data, and data with concept drift, that Data-SUITE best identifies ID regions where a downstream model may be reliable (independent of said model). 
We also illustrate how these identified regions can provide insights into datasets and highlight their limitations.", "bibtex": "@InProceedings{pmlr-v162-seedat22a,\n title = \t {Data-{SUITE}: Data-centric identification of in-distribution incongruous examples},\n author = {Seedat, Nabeel and Crabb{\\'e}, Jonathan and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19467--19496},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/seedat22a/seedat22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/seedat22a.html},\n abstract = \t {Systematic quantification of data quality is critical for consistent model performance. Prior works have focused on out-of-distribution data. Instead, we tackle an understudied yet equally important problem of characterizing incongruous regions of in-distribution (ID) data, which may arise from feature space heterogeneity. To this end, we propose a paradigm shift with Data-SUITE: a data-centric AI framework to identify these regions, independent of a task-specific model. Data-SUITE leverages copula modeling, representation learning, and conformal prediction to build feature-wise confidence interval estimators based on a set of training instances. These estimators can be used to evaluate the congruence of test instances with respect to the training set, to answer two practically useful questions: (1) which test instances will be reliably predicted by a model trained with the training instances? and (2) can we identify incongruous regions of the feature space so that data owners understand the data\u2019s limitations or guide future data collection? We empirically validate Data-SUITE\u2019s performance and coverage guarantees and demonstrate on cross-site medical data, biased data, and data with concept drift, that Data-SUITE best identifies ID regions where a downstream model may be reliable (independent of said model). 
We also illustrate how these identified regions can provide insights into datasets and highlight their limitations.}\n}", "pdf": "https://proceedings.mlr.press/v162/seedat22a/seedat22a.pdf", "supp": "", "pdf_size": 2770914, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11485689307897239676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK+The Alan Turing Institute, London, UK+University of California, Los Angeles, USA; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK+The Alan Turing Institute, London, UK+University of California, Los Angeles, USA; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK+The Alan Turing Institute, London, UK+University of California, Los Angeles, USA", "aff_domain": "cam.ac.uk; ; ", "email": "cam.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/seedat22a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", "aff_unique_norm": "University of Cambridge;Alan Turing Institute;University of California, Los Angeles", "aff_unique_dep": "Department of Applied Mathematics and Theoretical Physics;;", "aff_unique_url": "https://www.cam.ac.uk;https://www.turing.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;ATI;UCLA", "aff_campus_unique_index": "0+1+2;0+1+2;0+1+2", "aff_campus_unique": "Cambridge;London;Los Angeles", "aff_country_unique_index": "0+0+1;0+0+1;0+0+1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Datamodels: Understanding Predictions with Data and Data with Predictions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16971", "id": "16971", "proceeding": "https://proceedings.mlr.press/v162/ilyas22a.html", "poster": "/media/PosterPDFs/ICML%202022/24b43fb034a10d78bec71274033b4096.png?t=1658026775.0937712", "slides": "", "author_site": "Andrew Ilyas, Sung Min (Sam) Park, Logan Engstrom, Guillaume Leclerc, Aleksander Madry", "author": "Andrew Ilyas; Sung Min Park; Logan Engstrom; Guillaume Leclerc; Aleksander Madry", "abstract": "We present a conceptual framework,", "bibtex": "@InProceedings{pmlr-v162-ilyas22a,\n title = \t {Datamodels: Understanding Predictions with Data and Data with Predictions},\n author = {Ilyas, Andrew and Park, Sung Min and Engstrom, Logan and Leclerc, Guillaume and Madry, Aleksander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9525--9587},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ilyas22a/ilyas22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ilyas22a.html},\n abstract = \t {We present a conceptual framework,", "pdf": "https://proceedings.mlr.press/v162/ilyas22a/ilyas22a.pdf", "supp": "", "pdf_size": 20174872, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9873696963458193013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "MIT; MIT; MIT; MIT; MIT", "aff_domain": "mit.edu; ; ; ; ", "email": "mit.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ilyas22a.html", "aff_unique_index": 
"0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Dataset Condensation via Efficient Synthetic-Data Parameterization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17927", "id": "17927", "proceeding": "https://proceedings.mlr.press/v162/kim22c.html", "poster": "/media/PosterPDFs/ICML%202022/c57168a952f5d46724cf35dfc3d48a7f.png?t=1657168023.0703778", "slides": "", "author_site": "Jang-Hyun Kim, Jinuk Kim, Seong Joon Oh, Sangdoo Yun, Hwanjun Song, Joonhyun Jeong, Jung-Woo Ha, Hyun Oh Song", "author": "Jang-Hyun Kim; Jinuk Kim; Seong Joon Oh; Sangdoo Yun; Hwanjun Song; Joonhyun Jeong; Jung-Woo Ha; Hyun Oh Song", "abstract": "The great success of machine learning with massive amounts of data comes at a price of huge computation costs and storage for training and tuning. Recent studies on dataset condensation attempt to reduce the dependence on such massive data by synthesizing a compact training dataset. However, the existing approaches have fundamental limitations in optimization due to the limited representability of synthetic datasets without considering any data regularity characteristics. To this end, we propose a novel condensation framework that generates multiple synthetic data with a limited storage budget via efficient parameterization considering data regularity. We further analyze the shortcomings of the existing gradient matching-based condensation methods and develop an effective optimization technique for improving the condensation of training data information. We propose a unified algorithm that drastically improves the quality of condensed data against the current state-of-the-art on CIFAR-10, ImageNet, and Speech Commands.", "bibtex": "@InProceedings{pmlr-v162-kim22c,\n title = \t {Dataset Condensation via Efficient Synthetic-Data Parameterization},\n author = {Kim, Jang-Hyun and Kim, Jinuk and Oh, Seong Joon and Yun, Sangdoo and Song, Hwanjun and Jeong, Joonhyun and Ha, Jung-Woo and Song, Hyun Oh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11102--11118},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22c/kim22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22c.html},\n abstract = \t {The great success of machine learning with massive amounts of data comes at a price of huge computation costs and storage for training and tuning. Recent studies on dataset condensation attempt to reduce the dependence on such massive data by synthesizing a compact training dataset. However, the existing approaches have fundamental limitations in optimization due to the limited representability of synthetic datasets without considering any data regularity characteristics. To this end, we propose a novel condensation framework that generates multiple synthetic data with a limited storage budget via efficient parameterization considering data regularity. 
We further analyze the shortcomings of the existing gradient matching-based condensation methods and develop an effective optimization technique for improving the condensation of training data information. We propose a unified algorithm that drastically improves the quality of condensed data against the current state-of-the-art on CIFAR-10, ImageNet, and Speech Commands.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22c/kim22c.pdf", "supp": "", "pdf_size": 5262243, "gs_citation": 207, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13062983297577274052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science and Engineering, Seoul National University; Department of Computer Science and Engineering, Seoul National University; NAVER AI Lab; NAVER AI Lab; NAVER AI Lab; Image Vision, NAVER Clova; NAVER AI Lab; Department of Computer Science and Engineering, Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;navercorp.com;navercorp.com;navercorp.com;navercorp.com;navercorp.com;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr;navercorp.com;navercorp.com;navercorp.com;navercorp.com;navercorp.com;snu.ac.kr", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/kim22c.html", "aff_unique_index": "0;0;1;1;1;2;1;0", "aff_unique_norm": "Seoul National University;NAVER Corporation;NAVER Clova", "aff_unique_dep": "Department of Computer Science and Engineering;NAVER AI Lab;Image Vision", "aff_unique_url": "https://www.snu.ac.kr;https://www.naver.com;https://www.naver.com", "aff_unique_abbr": "SNU;NAVER;NAVER", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Dataset Condensation with Contrastive Signals", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18195", "id": "18195", "proceeding": "https://proceedings.mlr.press/v162/lee22b.html", "poster": "/media/PosterPDFs/ICML%202022/1f1baa5b8edac74eb4eaa329f14a0361.png?t=1657181146.2480915", "slides": "", "author_site": "Saehyung Lee, SANGHYUK CHUN, Sangwon Jung, Sangdoo Yun, Sungroh Yoon", "author": "Saehyung Lee; Sanghyuk Chun; Sangwon Jung; Sangdoo Yun; Sungroh Yoon", "abstract": "Recent studies have demonstrated that gradient matching-based dataset synthesis, or dataset condensation (DC), methods can achieve state-of-the-art performance when applied to data-efficient learning tasks. However, in this study, we prove that the existing DC methods can perform worse than the random selection method when task-irrelevant information forms a significant part of the training dataset. We attribute this to the lack of participation of the contrastive signals between the classes resulting from the class-wise gradient matching strategy. To address this problem, we propose Dataset Condensation with Contrastive signals (DCC) by modifying the loss function to enable the DC methods to effectively capture the differences between classes. In addition, we analyze the new loss function in terms of training dynamics by tracking the kernel velocity. Furthermore, we introduce a bi-level warm-up strategy to stabilize the optimization. Our experimental results indicate that while the existing methods are ineffective for fine-grained image classification tasks, the proposed method can successfully generate informative synthetic datasets for the same tasks. 
Moreover, we demonstrate that the proposed method outperforms the baselines even on benchmark datasets such as SVHN, CIFAR-10, and CIFAR-100. Finally, we demonstrate the high applicability of the proposed method by applying it to continual learning tasks.", "bibtex": "@InProceedings{pmlr-v162-lee22b,\n title = \t {Dataset Condensation with Contrastive Signals},\n author = {Lee, Saehyung and Chun, Sanghyuk and Jung, Sangwon and Yun, Sangdoo and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12352--12364},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22b/lee22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22b.html},\n abstract = \t {Recent studies have demonstrated that gradient matching-based dataset synthesis, or dataset condensation (DC), methods can achieve state-of-the-art performance when applied to data-efficient learning tasks. However, in this study, we prove that the existing DC methods can perform worse than the random selection method when task-irrelevant information forms a significant part of the training dataset. We attribute this to the lack of participation of the contrastive signals between the classes resulting from the class-wise gradient matching strategy. To address this problem, we propose Dataset Condensation with Contrastive signals (DCC) by modifying the loss function to enable the DC methods to effectively capture the differences between classes. In addition, we analyze the new loss function in terms of training dynamics by tracking the kernel velocity. Furthermore, we introduce a bi-level warm-up strategy to stabilize the optimization. Our experimental results indicate that while the existing methods are ineffective for fine-grained image classification tasks, the proposed method can successfully generate informative synthetic datasets for the same tasks. Moreover, we demonstrate that the proposed method outperforms the baselines even on benchmark datasets such as SVHN, CIFAR-10, and CIFAR-100. 
Finally, we demonstrate the high applicability of the proposed method by applying it to continual learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22b/lee22b.pdf", "supp": "", "pdf_size": 808277, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7694046388594127798&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, Seoul National University + NAVER AI Lab; NAVER AI Lab; Department of Electrical and Computer Engineering, Seoul National University + NAVER AI Lab; NAVER AI Lab; Interdisciplinary Program in Artificial Intelligence, Seoul National University + NAVER AI Lab", "aff_domain": "snu.ac.kr; ; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ; ;snu.ac.kr", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lee22b.html", "aff_unique_index": "0+1;1;0+1;1;0+1", "aff_unique_norm": "Seoul National University;NAVER Corporation", "aff_unique_dep": "Department of Electrical and Computer Engineering;AI Lab", "aff_unique_url": "https://www.snu.ac.kr;https://www.naver.com", "aff_unique_abbr": "SNU;NAVER", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0+0;0;0+0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "De novo mass spectrometry peptide sequencing with a transformer model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16795", "id": "16795", "proceeding": "https://proceedings.mlr.press/v162/yilmaz22a.html", "poster": "/media/PosterPDFs/ICML%202022/f51dc802382ce2b548bf73ff0726a31d.png?t=1657305568.3554509", "slides": "", "author_site": "Melih Yilmaz, William Fondrie, Wout Bittremieux, Sewoong Oh, William Noble", "author": "Melih Yilmaz; William Fondrie; Wout Bittremieux; Sewoong Oh; William S Noble", "abstract": "Tandem mass spectrometry is the only high-throughput method for analyzing the protein content of complex biological samples and is thus the primary technology driving the growth of the field of proteomics. A key outstanding challenge in this field involves identifying the sequence of amino acids -the peptide- responsible for generating each observed spectrum, without making use of prior knowledge in the form of a peptide sequence database. Although various machine learning methods have been developed to address this de novo sequencing problem, challenges that arise when modeling tandem mass spectra have led to complex models that combine multiple neural networks and post-processing steps. We propose a simple yet powerful method for de novo peptide sequencing, Casanovo, that uses a transformer framework to map directly from a sequence of observed peaks (a mass spectrum) to a sequence of amino acids (a peptide). Our experiments show that Casanovo achieves state-of-the-art performance on a benchmark dataset using a standard cross-species evaluation framework which involves testing with spectra with never-before-seen peptide labels. 
Casanovo not only achieves superior performance but does so at a fraction of the model complexity and inference time required by other methods.", "bibtex": "@InProceedings{pmlr-v162-yilmaz22a,\n title = \t {De novo mass spectrometry peptide sequencing with a transformer model},\n author = {Yilmaz, Melih and Fondrie, William and Bittremieux, Wout and Oh, Sewoong and Noble, William S},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25514--25522},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yilmaz22a/yilmaz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yilmaz22a.html},\n abstract = \t {Tandem mass spectrometry is the only high-throughput method for analyzing the protein content of complex biological samples and is thus the primary technology driving the growth of the field of proteomics. A key outstanding challenge in this field involves identifying the sequence of amino acids -the peptide- responsible for generating each observed spectrum, without making use of prior knowledge in the form of a peptide sequence database. Although various machine learning methods have been developed to address this de novo sequencing problem, challenges that arise when modeling tandem mass spectra have led to complex models that combine multiple neural networks and post-processing steps. We propose a simple yet powerful method for de novo peptide sequencing, Casanovo, that uses a transformer framework to map directly from a sequence of observed peaks (a mass spectrum) to a sequence of amino acids (a peptide). Our experiments show that Casanovo achieves state-of-the-art performance on a benchmark dataset using a standard cross-species evaluation framework which involves testing with spectra with never-before-seen peptide labels. Casanovo not only achieves superior performance but does so at a fraction of the model complexity and inference time required by other methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/yilmaz22a/yilmaz22a.pdf", "supp": "", "pdf_size": 1789384, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12196130041314246813&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle, WA, USA; Talus Bioscience, Seattle, WA, USA; Skaggs School of Pharmacy and Pharmaceutical Sciences, University of California San Diego, La Jolla, CA, USA; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle, WA, USA; Department of Genome Sciences, University of Washington, Seattle, WA, USA", "aff_domain": "cs.washington.edu;uw.edu; ; ; ", "email": "cs.washington.edu;uw.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yilmaz22a.html", "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University of Washington;Talus Bioscience;University of California, San Diego", "aff_unique_dep": "Paul G. 
Allen School of Computer Science and Engineering;;Skaggs School of Pharmacy and Pharmaceutical Sciences", "aff_unique_url": "https://www.washington.edu;;https://ucsd.edu", "aff_unique_abbr": "UW;;UCSD", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Seattle;La Jolla", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Debiaser Beware: Pitfalls of Centering Regularized Transport Maps", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16639", "id": "16639", "proceeding": "https://proceedings.mlr.press/v162/pooladian22a.html", "poster": "/media/PosterPDFs/ICML%202022/020c8bfac8de160d4c5543b96d1fdede.png?t=1658157693.4965777", "slides": "", "author_site": "Aram-Alexandre Pooladian, Marco Cuturi, Jonathan Niles-Weed", "author": "Aram-Alexandre Pooladian; Marco Cuturi; Jonathan Niles-Weed", "abstract": "Estimating optimal transport (OT) maps (a.k.a. Monge maps) between two measures P and Q is a problem fraught with computational and statistical challenges. A promising approach lies in using the dual potential functions obtained when solving an entropy-regularized OT problem between samples P_n and Q_n, which can be used to recover an approximately optimal map. The negentropy penalization in that scheme introduces, however, an estimation bias that grows with the regularization strength. A well-known remedy to debias such estimates, which has gained wide popularity among practitioners of regularized OT, is to center them, by subtracting auxiliary problems involving P_n and itself, as well as Q_n and itself. We do prove that, under favorable conditions on P and Q, debiasing can yield better approximations to the Monge map. However, and perhaps surprisingly, we present a few cases in which debiasing is provably detrimental in a statistical sense, notably when the regularization strength is large or the number of samples is small. These claims are validated experimentally on synthetic and real datasets, and should reopen the debate on whether debiasing is needed when using entropic OT.", "bibtex": "@InProceedings{pmlr-v162-pooladian22a,\n title = \t {Debiaser Beware: Pitfalls of Centering Regularized Transport Maps},\n author = {Pooladian, Aram-Alexandre and Cuturi, Marco and Niles-Weed, Jonathan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17830--17847},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pooladian22a/pooladian22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pooladian22a.html},\n abstract = \t {Estimating optimal transport (OT) maps (a.k.a. Monge maps) between two measures P and Q is a problem fraught with computational and statistical challenges. A promising approach lies in using the dual potential functions obtained when solving an entropy-regularized OT problem between samples P_n and Q_n, which can be used to recover an approximately optimal map. The negentropy penalization in that scheme introduces, however, an estimation bias that grows with the regularization strength. 
A well-known remedy to debias such estimates, which has gained wide popularity among practitioners of regularized OT, is to center them, by subtracting auxiliary problems involving P_n and itself, as well as Q_n and itself. We do prove that, under favorable conditions on P and Q, debiasing can yield better approximations to the Monge map. However, and perhaps surprisingly, we present a few cases in which debiasing is provably detrimental in a statistical sense, notably when the regularization strength is large or the number of samples is small. These claims are validated experimentally on synthetic and real datasets, and should reopen the debate on whether debiasing is needed when using entropic OT.}\n}", "pdf": "https://proceedings.mlr.press/v162/pooladian22a/pooladian22a.pdf", "supp": "", "pdf_size": 569240, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16734096209921065679&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Center for Data Science, New York University, New York, USA; Google Research, currently at Apple; Courant Institute of Mathematical Sciences, New York University, New York, USA", "aff_domain": "nyu.edu; ; ", "email": "nyu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/pooladian22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "New York University;Google", "aff_unique_dep": "Center for Data Science;Google Research", "aff_unique_url": "https://www.nyu.edu;https://research.google", "aff_unique_abbr": "NYU;Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Decentralized Online Convex Optimization in Networked Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16991", "id": "16991", "proceeding": "https://proceedings.mlr.press/v162/lin22c.html", "poster": "/media/PosterPDFs/ICML%202022/86b3e165b8154656a71ffe8a327ded7d.png?t=1657909403.686991", "slides": "", "author_site": "Yiheng Lin, Judy Gan, Guannan Qu, Yash Kanoria, Adam Wierman", "author": "Yiheng Lin; Judy Gan; Guannan Qu; Yash Kanoria; Adam Wierman", "abstract": "We study the problem of networked online convex optimization, where each agent individually decides on an action at every time step and agents cooperatively seek to minimize the total global cost over a finite horizon. The global cost is made up of three types of local costs: convex node costs, temporal interaction costs, and spatial interaction costs. In deciding their individual action at each time, an agent has access to predictions of local cost functions for the next $k$ time steps in an $r$-hop neighborhood. Our work proposes a novel online algorithm, Localized Predictive Control (LPC), which generalizes predictive control to multi-agent systems. We show that LPC achieves a competitive ratio of $1 + \\tilde{O}(\\rho_T^k) + \\tilde{O}(\\rho_S^r)$ in an adversarial setting, where $\\rho_T$ and $\\rho_S$ are constants in $(0, 1)$ that increase with the relative strength of temporal and spatial interaction costs, respectively. This is the first competitive ratio bound on decentralized predictive control for networked online convex optimization. 
Further, we show that the dependence on $k$ and $r$ in our results is near optimal by lower bounding the competitive ratio of any decentralized online algorithm.", "bibtex": "@InProceedings{pmlr-v162-lin22c,\n title = \t {Decentralized Online Convex Optimization in Networked Systems},\n author = {Lin, Yiheng and Gan, Judy and Qu, Guannan and Kanoria, Yash and Wierman, Adam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13356--13393},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22c/lin22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22c.html},\n abstract = \t {We study the problem of networked online convex optimization, where each agent individually decides on an action at every time step and agents cooperatively seek to minimize the total global cost over a finite horizon. The global cost is made up of three types of local costs: convex node costs, temporal interaction costs, and spatial interaction costs. In deciding their individual action at each time, an agent has access to predictions of local cost functions for the next $k$ time steps in an $r$-hop neighborhood. Our work proposes a novel online algorithm, Localized Predictive Control (LPC), which generalizes predictive control to multi-agent systems. We show that LPC achieves a competitive ratio of $1 + \\tilde{O}(\\rho_T^k) + \\tilde{O}(\\rho_S^r)$ in an adversarial setting, where $\\rho_T$ and $\\rho_S$ are constants in $(0, 1)$ that increase with the relative strength of temporal and spatial interaction costs, respectively. This is the first competitive ratio bound on decentralized predictive control for networked online convex optimization. 
Further, we show that the dependence on $k$ and $r$ in our results is near optimal by lower bounding the competitive ratio of any decentralized online algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22c/lin22c.pdf", "supp": "", "pdf_size": 538752, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5939924955471174769&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computing and Mathematical Sciences, California Institute of Technology; Decision, Risk, and Operations division, Columbia Business School; Department of Electrical and Computer Engineering, Carnegie Mellon University; Decision, Risk, and Operations division, Columbia Business School; Department of Computing and Mathematical Sciences, California Institute of Technology", "aff_domain": "caltech.edu; ; ; ; ", "email": "caltech.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lin22c.html", "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "California Institute of Technology;Columbia Business School;Carnegie Mellon University", "aff_unique_dep": "Department of Computing and Mathematical Sciences;Decision, Risk, and Operations division;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.caltech.edu;https://www.gsb.columbia.edu;https://www.cmu.edu", "aff_unique_abbr": "Caltech;CBS;CMU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Pasadena;;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deciphering Lasso-based Classification Through a Large Dimensional Analysis of the Iterative Soft-Thresholding Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17359", "id": "17359", "proceeding": "https://proceedings.mlr.press/v162/tiomoko22a.html", "poster": "/media/PosterPDFs/ICML%202022/1bd4b29a8e0afccd9923fe29cecb4b29.png?t=1657528465.951724", "slides": "", "author_site": "Malik TIOMOKO, Ekkehard Schnoor, Mohamed El Amine Seddik, Igor Colin, Aladin Virmaux", "author": "Malik Tiomoko; Ekkehard Schnoor; Mohamed El Amine Seddik; Igor Colin; Aladin Virmaux", "abstract": "This paper proposes a theoretical analysis of a Lasso-based classification algorithm. Leveraging on a realistic regime where the dimension of the data $p$ and their number $n$ are of the same order of magnitude, the theoretical classification error is derived as a function of the data statistics. As a result, insights into the functioning of the Lasso in classification and its differences with competing algorithms are highlighted. Our work is based on an original novel analysis of the Iterative Soft-Thresholding Algorithm (ISTA), which may be of independent interest beyond the particular problem studied here and may be adapted to similar iterative schemes. A theoretical optimization of the model\u2019s hyperparameters is also provided, which allows for the data- and time-consuming cross-validation to be avoided. 
Finally, several applications on synthetic and real data are provided to validate the theoretical study and justify its impact in the design and understanding of algorithms of practical interest.", "bibtex": "@InProceedings{pmlr-v162-tiomoko22a,\n title = \t {Deciphering Lasso-based Classification Through a Large Dimensional Analysis of the Iterative Soft-Thresholding Algorithm},\n author = {Tiomoko, Malik and Schnoor, Ekkehard and Seddik, Mohamed El Amine and Colin, Igor and Virmaux, Aladin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21449--21477},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tiomoko22a/tiomoko22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tiomoko22a.html},\n abstract = \t {This paper proposes a theoretical analysis of a Lasso-based classification algorithm. Leveraging on a realistic regime where the dimension of the data $p$ and their number $n$ are of the same order of magnitude, the theoretical classification error is derived as a function of the data statistics. As a result, insights into the functioning of the Lasso in classification and its differences with competing algorithms are highlighted. Our work is based on an original novel analysis of the Iterative Soft-Thresholding Algorithm (ISTA), which may be of independent interest beyond the particular problem studied here and may be adapted to similar iterative schemes. A theoretical optimization of the model\u2019s hyperparameters is also provided, which allows for the data- and time-consuming cross-validation to be avoided. 
Finally, several applications on synthetic and real data are provided to validate the theoretical study and justify its impact in the design and understanding of algorithms of practical interest.}\n}", "pdf": "https://proceedings.mlr.press/v162/tiomoko22a/tiomoko22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tiomoko22a-supp.zip", "pdf_size": 876892, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5422804404713342086&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Huawei Noah\u2019s Ark Lab, Paris, France; Chair for Mathematics of Information Processing, RWTH Aachen University, Germany; Mathematical and Algorithmic Sciences Laboratory, Huawei Technologies France; Huawei Noah\u2019s Ark Lab, Paris, France; Huawei Noah\u2019s Ark Lab, Paris, France", "aff_domain": "huawei.com;mathc.rwth-aachen.de; ; ; ", "email": "huawei.com;mathc.rwth-aachen.de; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/tiomoko22a.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Huawei;RWTH Aachen University", "aff_unique_dep": "Huawei Noah\u2019s Ark Lab;Chair for Mathematics of Information Processing", "aff_unique_url": "https://www.huawei.com/fr;https://www.rwth-aachen.de", "aff_unique_abbr": "HNAL;RWTH", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "France;Germany" }, { "title": "Decision-Focused Learning: Through the Lens of Learning to Rank", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18375", "id": "18375", "proceeding": "https://proceedings.mlr.press/v162/mandi22a.html", "poster": "/media/PosterPDFs/ICML%202022/bdcc41211aa62a8f10f26d1a2d1727bf.png?t=1658189760.9151764", "slides": "", "author_site": "Jayanta Mandi, V\u00edctor Bucarey, Maxime Mulamba Ke Tchomba, Tias Guns", "author": "Jayanta Mandi; V\u0131\u0301ctor Bucarey; Maxime Mulamba Ke Tchomba; Tias Guns", "abstract": "In the last years decision-focused learning framework, also known as predict-and-optimize, have received increasing attention. In this setting, the predictions of a machine learning model are used as estimated cost coefficients in the objective function of a discrete combinatorial optimization problem for decision making. Decision-focused learning proposes to train the ML models, often neural network models, by directly optimizing the quality of decisions made by the optimization solvers. Based on a recent work that proposed a noise contrastive estimation loss over a subset of the solution space, we observe that decision-focused learning can more generally be seen as a learning-to-rank problem, where the goal is to learn an objective function that ranks the feasible points correctly. This observation is independent of the optimization method used and of the form of the objective function. We develop pointwise, pairwise and listwise ranking loss functions, which can be differentiated in closed form given a subset of solutions. We empirically investigate the quality of our generic methods compared to existing decision-focused learning approaches with competitive results. 
Furthermore, controlling the subset of solutions allows controlling the runtime considerably, with limited effect on regret.", "bibtex": "@InProceedings{pmlr-v162-mandi22a,\n title = \t {Decision-Focused Learning: Through the Lens of Learning to Rank},\n author = {Mandi, Jayanta and Bucarey, V\\'{\\i}ctor and Tchomba, Maxime Mulamba Ke and Guns, Tias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14935--14947},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mandi22a/mandi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mandi22a.html},\n abstract = \t {In the last years decision-focused learning framework, also known as predict-and-optimize, have received increasing attention. In this setting, the predictions of a machine learning model are used as estimated cost coefficients in the objective function of a discrete combinatorial optimization problem for decision making. Decision-focused learning proposes to train the ML models, often neural network models, by directly optimizing the quality of decisions made by the optimization solvers. Based on a recent work that proposed a noise contrastive estimation loss over a subset of the solution space, we observe that decision-focused learning can more generally be seen as a learning-to-rank problem, where the goal is to learn an objective function that ranks the feasible points correctly. This observation is independent of the optimization method used and of the form of the objective function. We develop pointwise, pairwise and listwise ranking loss functions, which can be differentiated in closed form given a subset of solutions. We empirically investigate the quality of our generic methods compared to existing decision-focused learning approaches with competitive results. Furthermore, controlling the subset of solutions allows controlling the runtime considerably, with limited effect on regret.}\n}", "pdf": "https://proceedings.mlr.press/v162/mandi22a/mandi22a.pdf", "supp": "", "pdf_size": 1809349, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=68474757504279365&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Data Analytics Laboratory, Vrije Universiteit Brussel, Belgium; Institute of Engineering Sciences, Universidad de O\u2019Higgins, Rancagua, Chile; Data Analytics Laboratory, Vrije Universiteit Brussel, Belgium; Dept. 
Computer Science, KU Leuven, Belgium", "aff_domain": "vub.be;uoh.cl; ;kuleuven.be", "email": "vub.be;uoh.cl; ;kuleuven.be", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mandi22a.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Vrije Universiteit Brussel;Universidad de O'Higgins;KU Leuven", "aff_unique_dep": "Data Analytics Laboratory;Institute of Engineering Sciences;Department of Computer Science", "aff_unique_url": "https://www.vub.be;https://www.uoh.cl;https://www.kuleuven.be", "aff_unique_abbr": ";;KU Leuven", "aff_campus_unique_index": "1", "aff_campus_unique": ";Rancagua", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Belgium;Chile" }, { "title": "Decomposing Temporal High-Order Interactions via Latent ODEs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16209", "id": "16209", "proceeding": "https://proceedings.mlr.press/v162/li22i.html", "poster": "/media/PosterPDFs/ICML%202022/c4015b7f368e6b4871809f49debe0579.png?t=1657227565.3134556", "slides": "", "author_site": "Shibo Li, Robert Kirby, Shandian Zhe", "author": "Shibo Li; Robert Kirby; Shandian Zhe", "abstract": "High-order interactions between multiple objects are common in real-world applications. Although tensor decomposition is a popular framework for high-order interaction analysis and prediction, most methods cannot well exploit the valuable timestamp information in data. The existent methods either discard the timestamps or convert them into discrete steps or use over-simplistic decomposition models. As a result, these methods might not be capable enough of capturing complex, fine-grained temporal dynamics or making accurate predictions for long-term interaction results. To overcome these limitations, we propose a novel Temporal High-order Interaction decompoSition model based on Ordinary Differential Equations (THIS-ODE). We model the time-varying interaction result with a latent ODE. To capture the complex temporal dynamics, we use a neural network (NN) to learn the time derivative of the ODE state. We use the representation of the interaction objects to model the initial value of the ODE and to constitute a part of the NN input to compute the state. In this way, the temporal relationships of the participant objects can be estimated and encoded into their representations. \tFor tractable and scalable inference, we use forward sensitivity analysis to efficiently compute the gradient of ODE state, based on which we use integral transform to develop a stochastic mini-batch learning algorithm. We demonstrate the advantage of our approach in simulation and four real-world applications.", "bibtex": "@InProceedings{pmlr-v162-li22i,\n title = \t {Decomposing Temporal High-Order Interactions via Latent {ODE}s},\n author = {Li, Shibo and Kirby, Robert and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12797--12812},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22i/li22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22i.html},\n abstract = \t {High-order interactions between multiple objects are common in real-world applications. 
Although tensor decomposition is a popular framework for high-order interaction analysis and prediction, most methods cannot well exploit the valuable timestamp information in data. The existent methods either discard the timestamps or convert them into discrete steps or use over-simplistic decomposition models. As a result, these methods might not be capable enough of capturing complex, fine-grained temporal dynamics or making accurate predictions for long-term interaction results. To overcome these limitations, we propose a novel Temporal High-order Interaction decompoSition model based on Ordinary Differential Equations (THIS-ODE). We model the time-varying interaction result with a latent ODE. To capture the complex temporal dynamics, we use a neural network (NN) to learn the time derivative of the ODE state. We use the representation of the interaction objects to model the initial value of the ODE and to constitute a part of the NN input to compute the state. In this way, the temporal relationships of the participant objects can be estimated and encoded into their representations. \tFor tractable and scalable inference, we use forward sensitivity analysis to efficiently compute the gradient of ODE state, based on which we use integral transform to develop a stochastic mini-batch learning algorithm. We demonstrate the advantage of our approach in simulation and four real-world applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22i/li22i.pdf", "supp": "", "pdf_size": 750014, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15385085383155168460&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Computing, University of Utah+Scientific Computing and Imaging (SCI) Institute, University of Utah; School of Computing, University of Utah+Scientific Computing and Imaging (SCI) Institute, University of Utah; School of Computing, University of Utah", "aff_domain": "cs.utah.edu; ;cs.utah.edu", "email": "cs.utah.edu; ;cs.utah.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22i.html", "aff_unique_index": "0+0;0+0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "U of U", "aff_campus_unique_index": "0+1;0+1;0", "aff_campus_unique": "Utah;Salt Lake City", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Deconfounded Value Decomposition for Multi-Agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18101", "id": "18101", "proceeding": "https://proceedings.mlr.press/v162/li22l.html", "poster": "/media/PosterPDFs/ICML%202022/5680522b8e2bb01943234bce7bf84534.png?t=1657769166.6728706", "slides": "", "author_site": "Jiahui Li, Kun Kuang, Baoxiang Wang, Furui Liu, Long Chen, Changjie Fan, Fei Wu, Jun Xiao", "author": "Jiahui Li; Kun Kuang; Baoxiang Wang; Furui Liu; Long Chen; Changjie Fan; Fei Wu; Jun Xiao", "abstract": "Value decomposition (VD) methods have been widely used in cooperative multi-agent reinforcement learning (MARL), where credit assignment plays an important role in guiding the agents\u2019 decentralized execution. In this paper, we investigate VD from a novel perspective of causal inference. 
We first show that the environment in existing VD methods is an unobserved confounder as the common cause factor of the global state and the joint value function, which leads to the confounding bias on learning credit assignment. We then present our approach, deconfounded value decomposition (DVD), which cuts off the backdoor confounding path from the global state to the joint value function. The cut is implemented by introducing the", "bibtex": "@InProceedings{pmlr-v162-li22l,\n title = \t {Deconfounded Value Decomposition for Multi-Agent Reinforcement Learning},\n author = {Li, Jiahui and Kuang, Kun and Wang, Baoxiang and Liu, Furui and Chen, Long and Fan, Changjie and Wu, Fei and Xiao, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12843--12856},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22l/li22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22l.html},\n abstract = \t {Value decomposition (VD) methods have been widely used in cooperative multi-agent reinforcement learning (MARL), where credit assignment plays an important role in guiding the agents\u2019 decentralized execution. In this paper, we investigate VD from a novel perspective of causal inference. We first show that the environment in existing VD methods is an unobserved confounder as the common cause factor of the global state and the joint value function, which leads to the confounding bias on learning credit assignment. We then present our approach, deconfounded value decomposition (DVD), which cuts off the backdoor confounding path from the global state to the joint value function. 
The cut is implemented by introducing the", "pdf": "https://proceedings.mlr.press/v162/li22l/li22l.pdf", "supp": "", "pdf_size": 1146393, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3217941460779065861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DCD Lab, College of Computer Science, Zhejiang University; DCD Lab, College of Computer Science, Zhejiang University; The Chinese University of Hong Kong, Shenzhen + Shenzhen Institute of Artificial Intelligence and Robotics for Society; Huawei Noah\u2019s Ark Lab; DCD Lab, College of Computer Science, Zhejiang University + Fuxi AI Lab, NetEase Games + Shanghai Institute for Advanced Study of Zhejiang University + Shanghai AI Laboratory; Fuxi AI Lab, NetEase Games; DCD Lab, College of Computer Science, Zhejiang University + Shanghai Institute for Advanced Study of Zhejiang University + Shanghai AI Laboratory; DCD Lab, College of Computer Science, Zhejiang University", "aff_domain": "zju.edu.cn;zju.edu.cn;szu.edu.cn;huawei.com;zju.edu.cn;163.com;zju.edu.cn;zju.edu.cn", "email": "zju.edu.cn;zju.edu.cn;szu.edu.cn;huawei.com;zju.edu.cn;163.com;zju.edu.cn;zju.edu.cn", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/li22l.html", "aff_unique_index": "0;0;1+2;3;0+4+0+5;4;0+0+5;0", "aff_unique_norm": "Zhejiang University;Chinese University of Hong Kong;Shenzhen Institute of Artificial Intelligence and Robotics for Society;Huawei;NetEase Games;Shanghai AI Laboratory", "aff_unique_dep": "College of Computer Science;;;Noah\u2019s Ark Lab;Fuxi AI Lab;", "aff_unique_url": "http://www.zju.edu.cn;https://www.cuhk.edu.cn;http://www.siarfs.org/;https://www.huawei.com;https://www.163.com;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "ZJU;CUHK;;Huawei;NetEase;SAIL", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Shenzhen;Shanghai", "aff_country_unique_index": "0;0;0+0;0;0+0+0+0;0;0+0+0;0", "aff_country_unique": "China" }, { "title": "Deduplicating Training Data Mitigates Privacy Risks in Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17279", "id": "17279", "proceeding": "https://proceedings.mlr.press/v162/kandpal22a.html", "poster": "/media/PosterPDFs/ICML%202022/eef6f4457ee96f8bae1893f5b234d238.png?t=1657687156.9570978", "slides": "", "author_site": "Nikhil Kandpal, Eric Wallace, Colin Raffel", "author": "Nikhil Kandpal; Eric Wallace; Colin Raffel", "abstract": "Past work has shown that large language models are susceptible to privacy attacks, where adversaries generate sequences from a trained model and detect which sequences are memorized from the training set. In this work, we show that the success of these attacks is largely due to duplication in commonly used web-scraped training sets. We first show that the rate at which language models regenerate training sequences is superlinearly related to a sequence\u2019s count in the training set. For instance, a sequence that is present 10 times in the training data is on average generated 1000x more often than a sequence that is present only once. We next show that existing methods for detecting memorized sequences have near-chance accuracy on non-duplicated training sequences. Finally, we find that after applying methods to deduplicate training data, language models are considerably more secure against these types of privacy attacks. 
Taken together, our results motivate an increased focus on deduplication in privacy-sensitive applications and a reevaluation of the practicality of existing privacy attacks.", "bibtex": "@InProceedings{pmlr-v162-kandpal22a,\n title = \t {Deduplicating Training Data Mitigates Privacy Risks in Language Models},\n author = {Kandpal, Nikhil and Wallace, Eric and Raffel, Colin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10697--10707},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kandpal22a/kandpal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kandpal22a.html},\n abstract = \t {Past work has shown that large language models are susceptible to privacy attacks, where adversaries generate sequences from a trained model and detect which sequences are memorized from the training set. In this work, we show that the success of these attacks is largely due to duplication in commonly used web-scraped training sets. We first show that the rate at which language models regenerate training sequences is superlinearly related to a sequence\u2019s count in the training set. For instance, a sequence that is present 10 times in the training data is on average generated 1000x more often than a sequence that is present only once. We next show that existing methods for detecting memorized sequences have near-chance accuracy on non-duplicated training sequences. Finally, we find that after applying methods to deduplicate training data, language models are considerably more secure against these types of privacy attacks. Taken together, our results motivate an increased focus on deduplication in privacy-sensitive applications and a reevaluation of the practicality of existing privacy attacks.}\n}", "pdf": "https://proceedings.mlr.press/v162/kandpal22a/kandpal22a.pdf", "supp": "", "pdf_size": 535196, "gs_citation": 311, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6895763931305755420&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "UNC Chapel Hill; UC Berkeley; UNC Chapel Hill", "aff_domain": "cs.unc.edu; ; ", "email": "cs.unc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kandpal22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of North Carolina at Chapel Hill;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.unc.edu;https://www.berkeley.edu", "aff_unique_abbr": "UNC;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Chapel Hill;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Deep Causal Metric Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17251", "id": "17251", "proceeding": "https://proceedings.mlr.press/v162/deng22c.html", "poster": "/media/PosterPDFs/ICML%202022/ed519c02f134f2cdd836cba387b6a3c8.png?t=1656825747.1738558", "slides": "", "author_site": "Xiang Deng, Zhongfei Zhang", "author": "Xiang Deng; Zhongfei Zhang", "abstract": "Deep metric learning aims to learn distance metrics that measure similarities and dissimilarities between samples. 
The existing approaches typically focus on designing different hard sample mining or distance margin strategies and then minimize a pair/triplet-based or proxy-based loss over the training data. However, this can lead the model to recklessly learn all the correlated distances found in training data including the spurious distance (e.g., background differences) that is not the distance of interest and can harm the generalization of the learned metric. To address this issue, we study metric learning from a causality perspective and accordingly propose deep causal metric learning (DCML) that pursues the true causality of the distance between samples. DCML is achieved through explicitly learning environment-invariant attention and task-invariant embedding based on causal inference. Extensive experiments on several benchmark datasets demonstrate the superiority of DCML over the existing methods.", "bibtex": "@InProceedings{pmlr-v162-deng22c,\n title = \t {Deep Causal Metric Learning},\n author = {Deng, Xiang and Zhang, Zhongfei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4993--5006},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/deng22c/deng22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/deng22c.html},\n abstract = \t {Deep metric learning aims to learn distance metrics that measure similarities and dissimilarities between samples. The existing approaches typically focus on designing different hard sample mining or distance margin strategies and then minimize a pair/triplet-based or proxy-based loss over the training data. However, this can lead the model to recklessly learn all the correlated distances found in training data including the spurious distance (e.g., background differences) that is not the distance of interest and can harm the generalization of the learned metric. To address this issue, we study metric learning from a causality perspective and accordingly propose deep causal metric learning (DCML) that pursues the true causality of the distance between samples. DCML is achieved through explicitly learning environment-invariant attention and task-invariant embedding based on causal inference. 
Extensive experiments on several benchmark datasets demonstrate the superiority of DCML over the existing methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/deng22c/deng22c.pdf", "supp": "", "pdf_size": 5335142, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5270585498557648956&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, State University of New York at Binghamton, NY, US; Department of Computer Science, State University of New York at Binghamton, NY, US", "aff_domain": "binghamton.edu; ", "email": "binghamton.edu; ", "github": "https://github.com/Xiang-Deng-DL/DCML", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/deng22c.html", "aff_unique_index": "0;0", "aff_unique_norm": "State University of New York at Binghamton", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.binghamton.edu", "aff_unique_abbr": "SUNY Binghamton", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Binghamton", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Deep Hierarchy in Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17191", "id": "17191", "proceeding": "https://proceedings.mlr.press/v162/hong22a.html", "poster": "/media/PosterPDFs/ICML%202022/878d5691c824ee2aaf770f7d36c151d6.png?t=1658012999.6178827", "slides": "", "author_site": "Joey Hong, Branislav Kveton, Sumeet Katariya, Manzil Zaheer, Mohammad Ghavamzadeh", "author": "Joey Hong; Branislav Kveton; Sumeet Katariya; Manzil Zaheer; Mohammad Ghavamzadeh", "abstract": "Mean rewards of actions are often correlated. The form of these correlations may be complex and unknown a priori, such as the preferences of users for recommended products and their categories. To maximize statistical efficiency, it is important to leverage these correlations when learning. We formulate a bandit variant of this problem where the correlations of mean action rewards are represented by a hierarchical Bayesian model with latent variables. Since the hierarchy can have multiple layers, we call it deep. We propose a hierarchical Thompson sampling algorithm (HierTS) for this problem and show how to implement it efficiently for Gaussian hierarchies. The efficient implementation is possible due to a novel exact hierarchical representation of the posterior, which itself is of independent interest. We use this exact posterior to analyze the Bayes regret of HierTS. Our regret bounds reflect the structure of the problem, that the regret decreases with more informative priors, and can be recast to highlight reduced dependence on the number of actions. 
We confirm these theoretical findings empirically, in both synthetic and real-world experiments.", "bibtex": "@InProceedings{pmlr-v162-hong22a,\n title = \t {Deep Hierarchy in Bandits},\n author = {Hong, Joey and Kveton, Branislav and Katariya, Sumeet and Zaheer, Manzil and Ghavamzadeh, Mohammad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8833--8851},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hong22a/hong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hong22a.html},\n abstract = \t {Mean rewards of actions are often correlated. The form of these correlations may be complex and unknown a priori, such as the preferences of users for recommended products and their categories. To maximize statistical efficiency, it is important to leverage these correlations when learning. We formulate a bandit variant of this problem where the correlations of mean action rewards are represented by a hierarchical Bayesian model with latent variables. Since the hierarchy can have multiple layers, we call it deep. We propose a hierarchical Thompson sampling algorithm (HierTS) for this problem and show how to implement it efficiently for Gaussian hierarchies. The efficient implementation is possible due to a novel exact hierarchical representation of the posterior, which itself is of independent interest. We use this exact posterior to analyze the Bayes regret of HierTS. Our regret bounds reflect the structure of the problem, that the regret decreases with more informative priors, and can be recast to highlight reduced dependence on the number of actions. 
We confirm these theoretical findings empirically, in both synthetic and real-world experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/hong22a/hong22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/hong22a-supp.zip", "pdf_size": 2870869, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4261705765797812318&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of California, Berkeley; Amazon; Amazon; DeepMind; Google Research", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/hong22a.html", "aff_unique_index": "0;1;1;2;3", "aff_unique_norm": "University of California, Berkeley;Amazon;DeepMind;Google", "aff_unique_dep": ";Amazon.com, Inc.;;Google Research", "aff_unique_url": "https://www.berkeley.edu;https://www.amazon.com;https://deepmind.com;https://research.google", "aff_unique_abbr": "UC Berkeley;Amazon;DeepMind;Google Research", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Deep Network Approximation in Terms of Intrinsic Parameters", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18197", "id": "18197", "proceeding": "https://proceedings.mlr.press/v162/shen22g.html", "poster": "/media/PosterPDFs/ICML%202022/bd294168a234d75851d6f26f02723ab1.png?t=1657216759.4429069", "slides": "", "author_site": "Zuowei Shen, Haizhao Yang, Shijun Zhang", "author": "Zuowei Shen; Haizhao Yang; Shijun Zhang", "abstract": "One of the arguments to explain the success of deep learning is the powerful approximation capacity of deep neural networks. Such capacity is generally accompanied by the explosive growth of the number of parameters, which, in turn, leads to high computational costs. It is of great interest to ask whether we can achieve successful deep learning with a small number of learnable parameters adapting to the target function. From an approximation perspective, this paper shows that the number of parameters that need to be learned can be significantly smaller than people typically expect. First, we theoretically design ReLU networks with a few learnable parameters to achieve an attractive approximation. We prove by construction that, for any Lipschitz continuous function $f$ on $[0,1]^d$ with a Lipschitz constant $\\lambda>0$, a ReLU network with $n+2$ intrinsic parameters (those depending on $f$) can approximate $f$ with an exponentially small error $5 \\lambda \\sqrt{d} \\, 2^{-n}$. Such a result is generalized to generic continuous functions. Furthermore, we show that the idea of learning a small number of parameters to achieve a good approximation can be numerically observed. 
We conduct several experiments to verify that training a small part of parameters can also achieve good results for classification problems if other parameters are pre-specified or pre-trained from a related problem.", "bibtex": "@InProceedings{pmlr-v162-shen22g,\n title = \t {Deep Network Approximation in Terms of Intrinsic Parameters},\n author = {Shen, Zuowei and Yang, Haizhao and Zhang, Shijun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19909--19934},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22g/shen22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22g.html},\n abstract = \t {One of the arguments to explain the success of deep learning is the powerful approximation capacity of deep neural networks. Such capacity is generally accompanied by the explosive growth of the number of parameters, which, in turn, leads to high computational costs. It is of great interest to ask whether we can achieve successful deep learning with a small number of learnable parameters adapting to the target function. From an approximation perspective, this paper shows that the number of parameters that need to be learned can be significantly smaller than people typically expect. First, we theoretically design ReLU networks with a few learnable parameters to achieve an attractive approximation. We prove by construction that, for any Lipschitz continuous function $f$ on $[0,1]^d$ with a Lipschitz constant $\\lambda>0$, a ReLU network with $n+2$ intrinsic parameters (those depending on $f$) can approximate $f$ with an exponentially small error $5 \\lambda \\sqrt{d} \\, 2^{-n}$. Such a result is generalized to generic continuous functions. Furthermore, we show that the idea of learning a small number of parameters to achieve a good approximation can be numerically observed. 
We conduct several experiments to verify that training a small part of parameters can also achieve good results for classification problems if other parameters are pre-specified or pre-trained from a related problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22g/shen22g.pdf", "supp": "", "pdf_size": 752504, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13052507772988452510&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Mathematics, National University of Singapore; Department of Mathematics, University of Maryland, College Park; Department of Mathematics, National University of Singapore", "aff_domain": "u.nus.edu; ;u.nus.edu", "email": "u.nus.edu; ;u.nus.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/shen22g.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "National University of Singapore;University of Maryland", "aff_unique_dep": "Department of Mathematics;Department of Mathematics", "aff_unique_url": "https://www.nus.edu.sg;https://www/umd.edu", "aff_unique_abbr": "NUS;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Singapore;United States" }, { "title": "Deep Networks on Toroids: Removing Symmetries Reveals the Structure of Flat Regions in the Landscape Geometry", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17897", "id": "17897", "proceeding": "https://proceedings.mlr.press/v162/pittorino22a.html", "poster": "/media/PosterPDFs/ICML%202022/d2a10b0bd670e442b1d3caa3fbf9e695.png?t=1658259367.4932873", "slides": "", "author_site": "Fabrizio Pittorino, Antonio Ferraro, Gabriele Perugini, Christoph Feinauer, Carlo Baldassi, RIccardo Zecchina", "author": "Fabrizio Pittorino; Antonio Ferraro; Gabriele Perugini; Christoph Feinauer; Carlo Baldassi; Riccardo Zecchina", "abstract": "We systematize the approach to the investigation of deep neural network landscapes by basing it on the geometry of the space of implemented functions rather than the space of parameters. Grouping classifiers into equivalence classes, we develop a standardized parameterization in which all symmetries are removed, resulting in a toroidal topology. On this space, we explore the error landscape rather than the loss. This lets us derive a meaningful notion of the flatness of minimizers and of the geodesic paths connecting them. Using different optimization algorithms that sample minimizers with different flatness we study the mode connectivity and relative distances. Testing a variety of state-of-the-art architectures and benchmark datasets, we confirm the correlation between flatness and generalization performance; we further show that in function space flatter minima are closer to each other and that the barriers along the geodesics connecting them are small. We also find that minimizers found by variants of gradient descent can be connected by zero-error paths composed of two straight lines in parameter space, i.e. polygonal chains with a single bend. We observe similar qualitative results in neural networks with binary weights and activations, providing one of the first results concerning the connectivity in this setting. 
Our results hinge on symmetry removal, and are in remarkable agreement with the rich phenomenology described by some recent analytical studies performed on simple shallow models.", "bibtex": "@InProceedings{pmlr-v162-pittorino22a,\n title = \t {Deep Networks on Toroids: Removing Symmetries Reveals the Structure of Flat Regions in the Landscape Geometry},\n author = {Pittorino, Fabrizio and Ferraro, Antonio and Perugini, Gabriele and Feinauer, Christoph and Baldassi, Carlo and Zecchina, Riccardo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17759--17781},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pittorino22a/pittorino22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pittorino22a.html},\n abstract = \t {We systematize the approach to the investigation of deep neural network landscapes by basing it on the geometry of the space of implemented functions rather than the space of parameters. Grouping classifiers into equivalence classes, we develop a standardized parameterization in which all symmetries are removed, resulting in a toroidal topology. On this space, we explore the error landscape rather than the loss. This lets us derive a meaningful notion of the flatness of minimizers and of the geodesic paths connecting them. Using different optimization algorithms that sample minimizers with different flatness we study the mode connectivity and relative distances. Testing a variety of state-of-the-art architectures and benchmark datasets, we confirm the correlation between flatness and generalization performance; we further show that in function space flatter minima are closer to each other and that the barriers along the geodesics connecting them are small. We also find that minimizers found by variants of gradient descent can be connected by zero-error paths composed of two straight lines in parameter space, i.e. polygonal chains with a single bend. We observe similar qualitative results in neural networks with binary weights and activations, providing one of the first results concerning the connectivity in this setting. Our results hinge on symmetry removal, and are in remarkable agreement with the rich phenomenology described by some recent analytical studies performed on simple shallow models.}\n}", "pdf": "https://proceedings.mlr.press/v162/pittorino22a/pittorino22a.pdf", "supp": "", "pdf_size": 2026253, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5996664399634016980&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy; AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy; AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy + Dept. 
of Applied Science and Technology, Politecnico di Torino, 10129 Torino, Italy; AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy; AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy; AI Lab, Institute for Data Science and Analytics, Bocconi University, 20136 Milano, Italy", "aff_domain": "unibocconi.it; ; ; ;unibocconi.it; ", "email": "unibocconi.it; ; ; ;unibocconi.it; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/pittorino22a.html", "aff_unique_index": "0;0;0+1;0;0;0", "aff_unique_norm": "Bocconi University;Politecnico di Torino", "aff_unique_dep": "Institute for Data Science and Analytics;Dept. of Applied Science and Technology", "aff_unique_url": "https://www.bocconi.edu;https://www.polito.it", "aff_unique_abbr": "Bocconi;Politecnico di Torino", "aff_campus_unique_index": "0;0;0+1;0;0;0", "aff_campus_unique": "Milano;Torino", "aff_country_unique_index": "0;0;0+0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Deep Neural Network Fusion via Graph Matching with Applications to Model Ensemble and Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17911", "id": "17911", "proceeding": "https://proceedings.mlr.press/v162/liu22k.html", "poster": "/media/PosterPDFs/ICML%202022/42a3964579017f3cb42b26605b9ae8ef_uDyPC7E.png?t=1657530263.712678", "slides": "", "author_site": "Chang Liu, Chenfei Lou, Runzhong Wang, Alan Yuhan Xi, Li Shen, Junchi Yan", "author": "Chang Liu; Chenfei Lou; Runzhong Wang; Alan Yuhan Xi; Li Shen; Junchi Yan", "abstract": "Model fusion without accessing training data in machine learning has attracted increasing interest due to the practical resource-saving and data privacy issues. During the training process, the neural weights of each model can be randomly permuted, and we have to align the channels of each layer before fusing them. Regarding the channels as nodes and weights as edges, aligning the channels to maximize weight similarity is a challenging NP-hard assignment problem. Due to its quadratic assignment nature, we formulate the model fusion problem as a graph matching task, considering the second-order similarity of model weights instead of previous work merely formulating model fusion as a linear assignment problem. For the rising problem scale and multi-model consistency issues, we propose an efficient graduated assignment-based model fusion method, dubbed GAMF, which iteratively updates the matchings in a consistency-maintaining manner. We apply GAMF to tackle the compact model ensemble task and federated learning task on MNIST, CIFAR-10, CIFAR-100, and Tiny-Imagenet. 
The performance shows the efficacy of our GAMF compared to state-of-the-art baselines.", "bibtex": "@InProceedings{pmlr-v162-liu22k,\n title = \t {Deep Neural Network Fusion via Graph Matching with Applications to Model Ensemble and Federated Learning},\n author = {Liu, Chang and Lou, Chenfei and Wang, Runzhong and Xi, Alan Yuhan and Shen, Li and Yan, Junchi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13857--13869},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22k/liu22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22k.html},\n abstract = \t {Model fusion without accessing training data in machine learning has attracted increasing interest due to the practical resource-saving and data privacy issues. During the training process, the neural weights of each model can be randomly permuted, and we have to align the channels of each layer before fusing them. Regarding the channels as nodes and weights as edges, aligning the channels to maximize weight similarity is a challenging NP-hard assignment problem. Due to its quadratic assignment nature, we formulate the model fusion problem as a graph matching task, considering the second-order similarity of model weights instead of previous work merely formulating model fusion as a linear assignment problem. For the rising problem scale and multi-model consistency issues, we propose an efficient graduated assignment-based model fusion method, dubbed GAMF, which iteratively updates the matchings in a consistency-maintaining manner. We apply GAMF to tackle the compact model ensemble task and federated learning task on MNIST, CIFAR-10, CIFAR-100, and Tiny-Imagenet. 
The performance shows the efficacy of our GAMF compared to state-of-the-art baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22k/liu22k.pdf", "supp": "", "pdf_size": 1014646, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13000348666462381813&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science and Engineering, and MoE Key Lab of AI, Shanghai Jiao Tong University; Department of Computer Science and Engineering, and MoE Key Lab of AI, Shanghai Jiao Tong University; Department of Computer Science and Engineering, and MoE Key Lab of AI, Shanghai Jiao Tong University; University of Wisconsin Madison; JD Explore Academy; Shanghai AI Laboratory + Department of Computer Science and Engineering, and MoE Key Lab of AI, Shanghai Jiao Tong University", "aff_domain": "sjtu.edu.cn; ; ; ; ;sjtu.edu.cn", "email": "sjtu.edu.cn; ; ; ; ;sjtu.edu.cn", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/liu22k.html", "aff_unique_index": "0;0;0;1;2;3+0", "aff_unique_norm": "Shanghai Jiao Tong University;University of Wisconsin-Madison;JD;Shanghai AI Laboratory", "aff_unique_dep": "Department of Computer Science and Engineering;;JD Explore Academy;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.wisc.edu;;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "SJTU;UW-Madison;;SAIL", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;1;0+0", "aff_country_unique": "China;United States;" }, { "title": "Deep Probability Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16299", "id": "16299", "proceeding": "https://proceedings.mlr.press/v162/liu22f.html", "poster": "/media/PosterPDFs/ICML%202022/fc9e62695def29ccdb9eb3fed5b4c8c8.png?t=1657812914.80251", "slides": "", "author_site": "Sheng Liu, Aakash Kaku, Weicheng Zhu, Matan Leibovich, Sreyas Mohan, Boyang Yu, Haoxiang Huang, Laure Zanna, Narges Razavian, Jonathan Niles-Weed, Carlos Fernandez-Granda", "author": "Sheng Liu; Aakash Kaku; Weicheng Zhu; Matan Leibovich; Sreyas Mohan; Boyang Yu; Haoxiang Huang; Laure Zanna; Narges Razavian; Jonathan Niles-Weed; Carlos Fernandez-Granda", "abstract": "Reliable probability estimation is of crucial importance in many real-world applications where there is inherent (aleatoric) uncertainty. Probability-estimation models are trained on observed outcomes (e.g. whether it has rained or not, or whether a patient has died or not), because the ground-truth probabilities of the events of interest are typically unknown. The problem is therefore analogous to binary classification, with the difference that the objective is to estimate probabilities rather than predicting the specific outcome. This work investigates probability estimation from high-dimensional data using deep neural networks. There exist several methods to improve the probabilities generated by these models but they mostly focus on model (epistemic) uncertainty. For problems with inherent uncertainty, it is challenging to evaluate performance without access to ground-truth probabilities. To address this, we build a synthetic dataset to study and compare different computable metrics. 
We evaluate existing methods on the synthetic data as well as on three real-world probability estimation tasks, all of which involve inherent uncertainty: precipitation forecasting from radar images, predicting cancer patient survival from histopathology images, and predicting car crashes from dashcam videos. We also give a theoretical analysis of a model for high-dimensional probability estimation which reproduces several of the phenomena evinced in our experiments. Finally, we propose a new method for probability estimation using neural networks, which modifies the training process to promote output probabilities that are consistent with empirical probabilities computed from the data. The method outperforms existing approaches on most metrics on the simulated as well as real-world data.", "bibtex": "@InProceedings{pmlr-v162-liu22f,\n title = \t {Deep Probability Estimation},\n author = {Liu, Sheng and Kaku, Aakash and Zhu, Weicheng and Leibovich, Matan and Mohan, Sreyas and Yu, Boyang and Huang, Haoxiang and Zanna, Laure and Razavian, Narges and Niles-Weed, Jonathan and Fernandez-Granda, Carlos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13746--13781},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22f/liu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22f.html},\n abstract = \t {Reliable probability estimation is of crucial importance in many real-world applications where there is inherent (aleatoric) uncertainty. Probability-estimation models are trained on observed outcomes (e.g. whether it has rained or not, or whether a patient has died or not), because the ground-truth probabilities of the events of interest are typically unknown. The problem is therefore analogous to binary classification, with the difference that the objective is to estimate probabilities rather than predicting the specific outcome. This work investigates probability estimation from high-dimensional data using deep neural networks. There exist several methods to improve the probabilities generated by these models but they mostly focus on model (epistemic) uncertainty. For problems with inherent uncertainty, it is challenging to evaluate performance without access to ground-truth probabilities. To address this, we build a synthetic dataset to study and compare different computable metrics. We evaluate existing methods on the synthetic data as well as on three real-world probability estimation tasks, all of which involve inherent uncertainty: precipitation forecasting from radar images, predicting cancer patient survival from histopathology images, and predicting car crashes from dashcam videos. We also give a theoretical analysis of a model for high-dimensional probability estimation which reproduces several of the phenomena evinced in our experiments. Finally, we propose a new method for probability estimation using neural networks, which modifies the training process to promote output probabilities that are consistent with empirical probabilities computed from the data. 
The method outperforms existing approaches on most metrics on the simulated as well as real-world data.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22f/liu22f.pdf", "supp": "", "pdf_size": 18947851, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15071615876185321831&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA; Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Department of Population Health & Department of Radiology, NYU School of Medicine, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Courant Institute of Mathematical Sciences, New York University, New York, USA", "aff_domain": "nyu.edu;nyu.edu;nyu.edu;nyu.edu;nyu.edu; ; ; ;nyu.edu; ; ", "email": "nyu.edu;nyu.edu;nyu.edu;nyu.edu;nyu.edu; ; ; ;nyu.edu; ; ", "github": "https://jackzhu727.github.io/deep-probability-estimation/", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/liu22f.html", "aff_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0;0+0;1;0+0;0+0", "aff_unique_norm": "New York University;NYU School of Medicine", "aff_unique_dep": "Center for Data Science;Department of Population Health", "aff_unique_url": "https://www.nyu.edu;https://med.nyu.edu", "aff_unique_abbr": "NYU;NYU School of Medicine", "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0;0+0;0;0+0;0+0", "aff_campus_unique": "New York", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0;0;0+0;0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Deep Reference Priors: What is the best way to pretrain a model?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17573", "id": "17573", "proceeding": "https://proceedings.mlr.press/v162/gao22d.html", "poster": "/media/PosterPDFs/ICML%202022/02b1be0d48924c327124732726097157.png?t=1657815708.942869", "slides": "", "author_site": "Yansong Gao, Rahul Ramesh, Pratik Chaudhari", "author": "Yansong Gao; Rahul Ramesh; Pratik Chaudhari", "abstract": "What is the best way to exploit extra data \u2013 be it unlabeled data from the same task, or labeled data from a related task \u2013 to learn a given task? This paper formalizes the question using the theory of reference priors. Reference priors are objective, uninformative Bayesian priors that maximize the mutual information between the task and the weights of the model. 
Such priors enable the task to maximally affect the Bayesian posterior, e.g., reference priors depend upon the number of samples available for learning the task and for very small sample sizes, the prior puts more probability mass on low-complexity models in the hypothesis space. This paper presents the first demonstration of reference priors for medium-scale deep networks and image-based data. We develop generalizations of reference priors and demonstrate applications to two problems. First, by using unlabeled data to compute the reference prior, we develop new Bayesian semi-supervised learning methods that remain effective even with very few samples per class. Second, by using labeled data from the source task to compute the reference prior, we develop a new pretraining method for transfer learning that allows data from the target task to maximally affect the Bayesian posterior. Empirical validation of these methods is conducted on image classification datasets. Code is available at https://github.com/grasp-lyrl/deep_reference_priors", "bibtex": "@InProceedings{pmlr-v162-gao22d,\n title = \t {Deep Reference Priors: What is the best way to pretrain a model?},\n author = {Gao, Yansong and Ramesh, Rahul and Chaudhari, Pratik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7036--7051},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22d/gao22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22d.html},\n abstract = \t {What is the best way to exploit extra data \u2013 be it unlabeled data from the same task, or labeled data from a related task \u2013 to learn a given task? This paper formalizes the question using the theory of reference priors. Reference priors are objective, uninformative Bayesian priors that maximize the mutual information between the task and the weights of the model. Such priors enable the task to maximally affect the Bayesian posterior, e.g., reference priors depend upon the number of samples available for learning the task and for very small sample sizes, the prior puts more probability mass on low-complexity models in the hypothesis space. This paper presents the first demonstration of reference priors for medium-scale deep networks and image-based data. We develop generalizations of reference priors and demonstrate applications to two problems. First, by using unlabeled data to compute the reference prior, we develop new Bayesian semi-supervised learning methods that remain effective even with very few samples per class. Second, by using labeled data from the source task to compute the reference prior, we develop a new pretraining method for transfer learning that allows data from the target task to maximally affect the Bayesian posterior. Empirical validation of these methods is conducted on image classification datasets. 
Code is available at https://github.com/grasp-lyrl/deep_reference_priors}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22d/gao22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/gao22d-supp.zip", "pdf_size": 6890973, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4730027445871462981&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Applied Mathematics and Computational Sciences, University of Pennsylvania+Computer and Information Science, University of Pennsylvania; Computer and Information Science, University of Pennsylvania+Applied Mathematics and Computational Sciences, University of Pennsylvania; Electrical and Systems Engineering, University of Pennsylvania", "aff_domain": "sas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "email": "sas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "github": "https://github.com/grasp-lyrl/deep-reference-priors", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gao22d.html", "aff_unique_index": "0+0;0+0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "Department of Applied Mathematics and Computational Sciences", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Deep Safe Incomplete Multi-view Clustering: Theorem and Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18209", "id": "18209", "proceeding": "https://proceedings.mlr.press/v162/tang22c.html", "poster": "/media/PosterPDFs/ICML%202022/ad8e88c0f76fa4fc8e5474384142a00a.png?t=1657522605.1700804", "slides": "", "author_site": "Huayi Tang, Yong Liu", "author": "Huayi Tang; Yong Liu", "abstract": "Incomplete multi-view clustering is a significant but challenging task. Although jointly imputing incomplete samples and conducting clustering has been shown to achieve promising performance, learning from both complete and incomplete data may be worse than learning only from complete data, particularly when imputed views are semantically inconsistent with missing views. To address this issue, we propose a novel framework to reduce the clustering performance degradation risk from semantically inconsistent imputed views. Concretely, by the proposed bi-level optimization framework, missing views are dynamically imputed from the learned semantic neighbors, and imputed samples are automatically selected for training. In theory, the empirical risk of the model is no higher than learning only from complete data, and the model is never worse than learning only from complete data in terms of expected risk with high probability. 
Comprehensive experiments demonstrate that the proposed method achieves superior performance and efficient safe incomplete multi-view clustering.", "bibtex": "@InProceedings{pmlr-v162-tang22c,\n title = \t {Deep Safe Incomplete Multi-view Clustering: Theorem and Algorithm},\n author = {Tang, Huayi and Liu, Yong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21090--21110},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tang22c/tang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/tang22c.html},\n abstract = \t {Incomplete multi-view clustering is a significant but challenging task. Although jointly imputing incomplete samples and conducting clustering has been shown to achieve promising performance, learning from both complete and incomplete data may be worse than learning only from complete data, particularly when imputed views are semantically inconsistent with missing views. To address this issue, we propose a novel framework to reduce the clustering performance degradation risk from semantically inconsistent imputed views. Concretely, by the proposed bi-level optimization framework, missing views are dynamically imputed from the learned semantic neighbors, and imputed samples are automatically selected for training. In theory, the empirical risk of the model is no higher than learning only from complete data, and the model is never worse than learning only from complete data in terms of expected risk with high probability. 
Comprehensive experiments demonstrate that the proposed method achieves superior performance and efficient safe incomplete multi-view clustering.}\n}", "pdf": "https://proceedings.mlr.press/v162/tang22c/tang22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tang22c-supp.zip", "pdf_size": 5484894, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5842366095338485406&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China; Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "email": "ruc.edu.cn;ruc.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/tang22c.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Renmin University of China;Beijing Key Laboratory of Big Data Management and Analysis Methods", "aff_unique_dep": "Gaoling School of Artificial Intelligence;Big Data Management and Analysis", "aff_unique_url": "http://www.ruc.edu.cn;", "aff_unique_abbr": "RUC;", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "Deep Squared Euclidean Approximation to the Levenshtein Distance for DNA Storage", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16159", "id": "16159", "proceeding": "https://proceedings.mlr.press/v162/guo22f.html", "poster": "/media/PosterPDFs/ICML%202022/275d7fb2fd45098ad5c3ece2ed4a2824.png?t=1657208774.6308804", "slides": "", "author_site": "Alan J.X. Guo, Cong Liang, Qing-Hu Hou", "author": "Alan J.X. Guo; Cong Liang; Qing-Hu Hou", "abstract": "Storing information in DNA molecules is of great interest because of its advantages in longevity, high storage density, and low maintenance cost. A key step in the DNA storage pipeline is to efficiently cluster the retrieved DNA sequences according to their similarities. Levenshtein distance is the most suitable metric on the similarity between two DNA sequences, but it is inferior in terms of computational complexity and less compatible with mature clustering algorithms. In this work, we propose a novel deep squared Euclidean embedding for DNA sequences using Siamese neural network, squared Euclidean embedding, and chi-squared regression. The Levenshtein distance is approximated by the squared Euclidean distance between the embedding vectors, which is fast calculated and clustering algorithm friendly. The proposed approach is analyzed theoretically and experimentally. The results show that the proposed embedding is efficient and robust.", "bibtex": "@InProceedings{pmlr-v162-guo22f,\n title = \t {Deep Squared {E}uclidean Approximation to the Levenshtein Distance for {DNA} Storage},\n author = {Guo, Alan J.X. 
and Liang, Cong and Hou, Qing-Hu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8095--8108},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22f/guo22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22f.html},\n abstract = \t {Storing information in DNA molecules is of great interest because of its advantages in longevity, high storage density, and low maintenance cost. A key step in the DNA storage pipeline is to efficiently cluster the retrieved DNA sequences according to their similarities. Levenshtein distance is the most suitable metric on the similarity between two DNA sequences, but it is inferior in terms of computational complexity and less compatible with mature clustering algorithms. In this work, we propose a novel deep squared Euclidean embedding for DNA sequences using Siamese neural network, squared Euclidean embedding, and chi-squared regression. The Levenshtein distance is approximated by the squared Euclidean distance between the embedding vectors, which is fast calculated and clustering algorithm friendly. The proposed approach is analyzed theoretically and experimentally. The results show that the proposed embedding is efficient and robust.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22f/guo22f.pdf", "supp": "", "pdf_size": 1412064, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5772443543166208846&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Center for Applied Mathematics, Tianjin University, Tianjin, China; Center for Applied Mathematics, Tianjin University, Tianjin, China; School of Mathematics, Tianjin University, Tianjin, China", "aff_domain": "tju.edu.cn; ; ", "email": "tju.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/guo22f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "Center for Applied Mathematics", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "Tianjin U", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tianjin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Deep Variational Graph Convolutional Recurrent Network for Multivariate Time Series Anomaly Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18109", "id": "18109", "proceeding": "https://proceedings.mlr.press/v162/chen22x.html", "poster": "/media/PosterPDFs/ICML%202022/a9be4c2a4041cadbf9d61ae16dd1389e.png?t=1657424395.2332919", "slides": "", "author_site": "Wenchao Chen, Long Tian, Bo Chen, Liang Dai, Zhibin Duan, Mingyuan Zhou", "author": "Wenchao Chen; Long Tian; Bo Chen; Liang Dai; Zhibin Duan; Mingyuan Zhou", "abstract": "Anomaly detection within multivariate time series (MTS) is an essential task in both data mining and service quality management. Many recent works on anomaly detection focus on designing unsupervised probabilistic models to extract robust normal patterns of MTS. In this paper, we model sensor dependency and stochasticity within MTS by developing an embedding-guided probabilistic generative network. 
We combine it with an adaptive variational graph convolutional recurrent network and get variational GCRN (VGCRN) to model both spatial and temporal fine-grained correlations in MTS. To explore hierarchical latent representations, we further extend VGCRN into a deep variational network, which captures multilevel information at different layers and is robust to noisy time series. Moreover, we develop an upward-downward variational inference scheme that considers both forecasting-based and reconstruction-based losses, achieving an accurate posterior approximation of latent variables with better MTS representations. The experiments verify the superiority of the proposed method over current state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-chen22x,\n title = \t {Deep Variational Graph Convolutional Recurrent Network for Multivariate Time Series Anomaly Detection},\n author = {Chen, Wenchao and Tian, Long and Chen, Bo and Dai, Liang and Duan, Zhibin and Zhou, Mingyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3621--3633},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22x/chen22x.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22x.html},\n abstract = \t {Anomaly detection within multivariate time series (MTS) is an essential task in both data mining and service quality management. Many recent works on anomaly detection focus on designing unsupervised probabilistic models to extract robust normal patterns of MTS. In this paper, we model sensor dependency and stochasticity within MTS by developing an embedding-guided probabilistic generative network. We combine it with an adaptive variational graph convolutional recurrent network and get variational GCRN (VGCRN) to model both spatial and temporal fine-grained correlations in MTS. To explore hierarchical latent representations, we further extend VGCRN into a deep variational network, which captures multilevel information at different layers and is robust to noisy time series. Moreover, we develop an upward-downward variational inference scheme that considers both forecasting-based and reconstruction-based losses, achieving an accurate posterior approximation of latent variables with better MTS representations. 
The experiments verify the superiority of the proposed method over current state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22x/chen22x.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22x-supp.zip", "pdf_size": 1836576, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8246688240920124360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China; National Laboratory of Radar Signal Processing, Xidian University, Xi\u2019an, China; McCombs School of Business, The University of Texas at Austin, Austin, TX 78712, USA", "aff_domain": "mail.xidian.edu.com; ; ; ; ; ", "email": "mail.xidian.edu.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22x.html", "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "Xidian University;Chinese Academy of Sciences;University of Texas at Austin", "aff_unique_dep": "National Laboratory of Radar Signal Processing;Institute of Information Engineering;McCombs School of Business", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.cas.cn;https://www.mccombs.utexas.edu", "aff_unique_abbr": "Xidian;CAS;UT Austin", "aff_campus_unique_index": "0;0;0;1;0;2", "aff_campus_unique": "Xi'an;Beijing;Austin", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Deep and Flexible Graph Neural Architecture Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18175", "id": "18175", "proceeding": "https://proceedings.mlr.press/v162/zhang22s.html", "poster": "", "slides": "", "author_site": "Wentao Zhang, Zheyu Lin, Yu Shen, Yang Li, Zhi Yang, Bin Cui", "author": "Wentao Zhang; Zheyu Lin; Yu Shen; Yang Li; Zhi Yang; Bin Cui", "abstract": "Graph neural networks (GNNs) have been intensively applied to various graph-based applications. Despite their success, designing good GNN architectures is non-trivial, which heavily relies on lots of human efforts and domain knowledge. Although several attempts have been made in graph neural architecture search, they suffer from the following limitations: 1) fixed pipeline pattern of propagation (P) and transformation (T) operations; 2) restricted pipeline depth of GNN architectures. This paper proposes DFG-NAS, a novel method that searches for deep and flexible GNN architectures. Unlike most existing methods that focus on micro-architecture, DFG-NAS highlights another level of design: the search for macro-architectures of how atomic P and T are integrated and organized into a GNN. Concretely, DFG-NAS proposes a novel-designed search space for the P-T permutations and combinations based on the message-passing dis-aggregation, and defines various mutation strategies and employs the evolutionary algorithm to conduct an efficient and effective search. 
Empirical studies on four benchmark datasets demonstrate that DFG-NAS could find more powerful architectures than state-of-the-art manual designs and meanwhile are more efficient than the current graph neural architecture search approaches.", "bibtex": "@InProceedings{pmlr-v162-zhang22s,\n title = \t {Deep and Flexible Graph Neural Architecture Search},\n author = {Zhang, Wentao and Lin, Zheyu and Shen, Yu and Li, Yang and Yang, Zhi and Cui, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26362--26374},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22s/zhang22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22s.html},\n abstract = \t {Graph neural networks (GNNs) have been intensively applied to various graph-based applications. Despite their success, designing good GNN architectures is non-trivial, which heavily relies on lots of human efforts and domain knowledge. Although several attempts have been made in graph neural architecture search, they suffer from the following limitations: 1) fixed pipeline pattern of propagation (P) and (T) transformation operations; 2) restricted pipeline depth of GNN architectures. This paper proposes DFG-NAS, a novel method that searches for deep and flexible GNN architectures. Unlike most existing methods that focus on micro-architecture, DFG-NAS highlights another level of design: the search for macro-architectures of how atomic P and T are integrated and organized into a GNN. Concretely, DFG-NAS proposes a novel-designed search space for the P-T permutations and combinations based on the message-passing dis-aggregation, and defines various mutation strategies and employs the evolutionary algorithm to conduct an efficient and effective search. 
Empirical studies on four benchmark datasets demonstrate that DFG-NAS could find more powerful architectures than state-of-the-art manual designs and meanwhile are more efficient than the current graph neural architecture search approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22s/zhang22s.pdf", "supp": "", "pdf_size": 289497, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12933675562277145349&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "School of CS & Key Laboratory of High Confidence Software Technologies, Peking University; School of CS & Key Laboratory of High Confidence Software Technologies, Peking University; School of CS & Key Laboratory of High Confidence Software Technologies, Peking University; School of CS & Key Laboratory of High Confidence Software Technologies, Peking University; School of CS & Key Laboratory of High Confidence Software Technologies, Peking University + Institute of Computational Social Science, Peking University (Qingdao); School of CS & Key Laboratory of High Confidence Software Technologies, Peking University + Institute of Computational Social Science, Peking University (Qingdao)", "aff_domain": "pku.edu.cn;pku.edu.cn; ; ; ; ", "email": "pku.edu.cn;pku.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22s.html", "aff_unique_index": "0;0;0;0;0+0;0+0", "aff_unique_norm": "Peking University", "aff_unique_dep": "School of CS & Key Laboratory of High Confidence Software Technologies", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Qingdao", "aff_country_unique_index": "0;0;0;0;0+0;0+0", "aff_country_unique": "China" }, { "title": "Deep equilibrium networks are sensitive to initialization statistics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16465", "id": "16465", "proceeding": "https://proceedings.mlr.press/v162/agarwala22a.html", "poster": "", "slides": "", "author_site": "Atish Agarwala, Samuel Schoenholz", "author": "Atish Agarwala; Samuel S Schoenholz", "abstract": "Deep equilibrium networks (DEQs) are a promising way to construct models which trade off memory for compute. However, theoretical understanding of these models is still lacking compared to traditional networks, in part because of the repeated application of a single set of weights. We show that DEQs are sensitive to the higher order statistics of the matrix families from which they are initialized. In particular, initializing with orthogonal or symmetric matrices allows for greater stability in training. 
This gives us a practical prescription for initializations which allow for training with a broader range of initial weight scales.", "bibtex": "@InProceedings{pmlr-v162-agarwala22a,\n title = \t {Deep equilibrium networks are sensitive to initialization statistics},\n author = {Agarwala, Atish and Schoenholz, Samuel S},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {136--160},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/agarwala22a/agarwala22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/agarwala22a.html},\n abstract = \t {Deep equilibrium networks (DEQs) are a promising way to construct models which trade off memory for compute. However, theoretical understanding of these models is still lacking compared to traditional networks, in part because of the repeated application of a single set of weights. We show that DEQs are sensitive to the higher order statistics of the matrix families from which they are initialized. In particular, initializing with orthogonal or symmetric matrices allows for greater stability in training. This gives us a practical prescription for initializations which allow for training with a broader range of initial weight scales.}\n}", "pdf": "https://proceedings.mlr.press/v162/agarwala22a/agarwala22a.pdf", "supp": "", "pdf_size": 559261, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8013070763753790789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com;google.com", "email": "google.com;google.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/agarwala22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Deep symbolic regression for recurrence prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16219", "id": "16219", "proceeding": "https://proceedings.mlr.press/v162/d-ascoli22a.html", "poster": "/media/PosterPDFs/ICML%202022/792c7b5aae4a79e78aaeda80516ae2ac.png?t=1657976882.4884746", "slides": "", "author_site": "St\u00e9phane d'Ascoli, Pierre-Alexandre Kamienny, Guillaume Lample, Francois Charton", "author": "St\u00e9phane D\u2019Ascoli; Pierre-Alexandre Kamienny; Guillaume Lample; Francois Charton", "abstract": "Symbolic regression, i.e. predicting a function from the observation of its values, is well-known to be a challenging task. In this paper, we train Transformers to infer the function or recurrence relation underlying sequences of integers or floats, a typical task in human IQ tests which has hardly been tackled in the machine learning literature. We evaluate our integer model on a subset of OEIS sequences, and show that it outperforms built-in Mathematica functions for recurrence prediction. We also demonstrate that our float model is able to yield informative approximations of out-of-vocabulary functions and constants, e.g. 
$\\operatorname{bessel0}(x)\\approx \\frac{\\sin(x)+\\cos(x)}{\\sqrt{\\pi x}}$ and $1.644934\\approx \\pi^2/6$.", "bibtex": "@InProceedings{pmlr-v162-d-ascoli22a,\n title = \t {Deep symbolic regression for recurrence prediction},\n author = {D'Ascoli, St{\\'e}phane and Kamienny, Pierre-Alexandre and Lample, Guillaume and Charton, Francois},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4520--4536},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/d-ascoli22a/d-ascoli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/d-ascoli22a.html},\n abstract = \t {Symbolic regression, i.e. predicting a function from the observation of its values, is well-known to be a challenging task. In this paper, we train Transformers to infer the function or recurrence relation underlying sequences of integers or floats, a typical task in human IQ tests which has hardly been tackled in the machine learning literature. We evaluate our integer model on a subset of OEIS sequences, and show that it outperforms built-in Mathematica functions for recurrence prediction. We also demonstrate that our float model is able to yield informative approximations of out-of-vocabulary functions and constants, e.g. $\\operatorname{bessel0}(x)\\approx \\frac{\\sin(x)+\\cos(x)}{\\sqrt{\\pi x}}$ and $1.644934\\approx \\pi^2/6$.}\n}", "pdf": "https://proceedings.mlr.press/v162/d-ascoli22a/d-ascoli22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/d'ascoli22a-supp.zip", "pdf_size": 1597429, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3332401850828259360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Department of Physics, \u00b4Ecole Normale Sup\u00b4erieure, Paris + Meta AI, Paris; Meta AI, Paris + Laboratoire d\u2019Informatique de Paris 6, Sorbonne Universit \u00b4e, Paris; Meta AI, Paris; Meta AI, Paris", "aff_domain": "ens.fr; ; ; ", "email": "ens.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/d-ascoli22a.html", "aff_unique_index": "0+1;1+2;1;1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Meta;Sorbonne Universit\u00e9", "aff_unique_dep": "Department of Physics;Meta AI;Laboratoire d\u2019Informatique de Paris 6", "aff_unique_url": "https://www.ens.fr;https://meta.ai;https://www.sorbonne-universite.fr", "aff_unique_abbr": "ENS;Meta AI;Sorbonne", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "France" }, { "title": "DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16671", "id": "16671", "proceeding": "https://proceedings.mlr.press/v162/rajbhandari22a.html", "poster": "/media/PosterPDFs/ICML%202022/ad8d3a0a0f0a084a97fad357c649438c.png?t=1657690598.4724479", "slides": "", "author_site": "Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He", "author": "Samyam Rajbhandari; Conglong Li; Zhewei Yao; Minjia Zhang; Reza Yazdani Aminabadi; Ammar Ahmad Awan; Jeff Rasley; Yuxiong He", 
"abstract": "As the training of giant dense models hits the boundary on the availability and capability of the hardware resources today, Mixture-of-Experts (MoE) models have become one of the most promising model architectures due to their significant training cost reduction compared to quality-equivalent dense models. Their training cost saving is demonstrated from encoder-decoder models (prior works) to a 5x saving for auto-aggressive language models (this work). However, due to the much larger model size and unique architecture, how to provide fast MoE model inference remains challenging and unsolved, limiting their practical usage. To tackle this, we present DeepSpeed-MoE, an end-to-end MoE training and inference solution, including novel MoE architecture designs and model compression techniques that reduce MoE model size by up to 3.7x, and a highly optimized inference system that provides 7.3x better latency and cost compared to existing MoE inference solutions. DeepSpeed-MoE offers an unprecedented scale and efficiency to serve massive MoE models with up to 4.5x faster and 9x cheaper inference compared to quality-equivalent dense models. We hope our innovations and systems help open a promising path to new directions in the large model landscape, a shift from dense to sparse MoE models, where training and deploying higher-quality models with fewer resources becomes more widely possible.", "bibtex": "@InProceedings{pmlr-v162-rajbhandari22a,\n title = \t {{D}eep{S}peed-{M}o{E}: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation {AI} Scale},\n author = {Rajbhandari, Samyam and Li, Conglong and Yao, Zhewei and Zhang, Minjia and Aminabadi, Reza Yazdani and Awan, Ammar Ahmad and Rasley, Jeff and He, Yuxiong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18332--18346},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rajbhandari22a/rajbhandari22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rajbhandari22a.html},\n abstract = \t {As the training of giant dense models hits the boundary on the availability and capability of the hardware resources today, Mixture-of-Experts (MoE) models have become one of the most promising model architectures due to their significant training cost reduction compared to quality-equivalent dense models. Their training cost saving is demonstrated from encoder-decoder models (prior works) to a 5x saving for auto-aggressive language models (this work). However, due to the much larger model size and unique architecture, how to provide fast MoE model inference remains challenging and unsolved, limiting their practical usage. To tackle this, we present DeepSpeed-MoE, an end-to-end MoE training and inference solution, including novel MoE architecture designs and model compression techniques that reduce MoE model size by up to 3.7x, and a highly optimized inference system that provides 7.3x better latency and cost compared to existing MoE inference solutions. DeepSpeed-MoE offers an unprecedented scale and efficiency to serve massive MoE models with up to 4.5x faster and 9x cheaper inference compared to quality-equivalent dense models. 
We hope our innovations and systems help open a promising path to new directions in the large model landscape, a shift from dense to sparse MoE models, where training and deploying higher-quality models with fewer resources becomes more widely possible.}\n}", "pdf": "https://proceedings.mlr.press/v162/rajbhandari22a/rajbhandari22a.pdf", "supp": "", "pdf_size": 979512, "gs_citation": 315, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6450094276419504510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Microsoft; Microsoft; Microsoft; Microsoft; Microsoft; Microsoft; Microsoft; Microsoft", "aff_domain": "microsoft.com; ; ; ; ; ; ;microsoft.com", "email": "microsoft.com; ; ; ; ; ; ;microsoft.com", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/rajbhandari22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Corporation", "aff_unique_url": "https://www.microsoft.com", "aff_unique_abbr": "Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Delay-Adaptive Step-sizes for Asynchronous Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16177", "id": "16177", "proceeding": "https://proceedings.mlr.press/v162/wu22g.html", "poster": "/media/PosterPDFs/ICML%202022/5a29503a4909fcade36b1823e7cebcf5_hgwEhlC.png?t=1657182406.8165128", "slides": "/media/icml-2022/Slides/16177.pdf", "author_site": "Xuyang Wu, Sindri Magnusson, Hamid Reza Feyzmahdavian, Mikael Johansson", "author": "Xuyang Wu; Sindri Magnusson; Hamid Reza Feyzmahdavian; Mikael Johansson", "abstract": "In scalable machine learning systems, model training is often parallelized over multiple nodes that run without tight synchronization. Most analysis results for the related asynchronous algorithms use an upper bound on the information delays in the system to determine learning rates. Not only are such bounds hard to obtain in advance, but they also result in unnecessarily slow convergence. In this paper, we show that it is possible to use learning rates that depend on the actual time-varying delays in the system. We develop general convergence results for delay-adaptive asynchronous iterations and specialize these to proximal incremental gradient descent and block coordinate descent algorithms. For each of these methods, we demonstrate how delays can be measured on-line, present delay-adaptive step-size policies, and illustrate their theoretical and practical advantages over the state-of-the-art.", "bibtex": "@InProceedings{pmlr-v162-wu22g,\n title = \t {Delay-Adaptive Step-sizes for Asynchronous Learning},\n author = {Wu, Xuyang and Magnusson, Sindri and Feyzmahdavian, Hamid Reza and Johansson, Mikael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24093--24113},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22g/wu22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22g.html},\n abstract = \t {In scalable machine learning systems, model training is often parallelized over multiple nodes that run without tight synchronization. 
Most analysis results for the related asynchronous algorithms use an upper bound on the information delays in the system to determine learning rates. Not only are such bounds hard to obtain in advance, but they also result in unnecessarily slow convergence. In this paper, we show that it is possible to use learning rates that depend on the actual time-varying delays in the system. We develop general convergence results for delay-adaptive asynchronous iterations and specialize these to proximal incremental gradient descent and block coordinate descent algorithms. For each of these methods, we demonstrate how delays can be measured on-line, present delay-adaptive step-size policies, and illustrate their theoretical and practical advantages over the state-of-the-art.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22g/wu22g.pdf", "supp": "", "pdf_size": 589812, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12034073135647458504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Division of Decision and Control Systems, EECS, KTH Royal Institute of Technology, Stockholm, Sweden; Department of Computer and System Science, Stockholm University, Stockholm, Sweden; ABB Corporate Research, V \u00a8aster \u02daas, Sweden; Division of Decision and Control Systems, EECS, KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se; ; ; ", "email": "kth.se; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wu22g.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "KTH Royal Institute of Technology;Stockholm University;ABB Corporate Research", "aff_unique_dep": "Division of Decision and Control Systems;Department of Computer and System Science;", "aff_unique_url": "https://www.kth.se;https://www.su.se;https://new.abb.com/research", "aff_unique_abbr": "KTH;SU;ABB", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stockholm;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Sweden" }, { "title": "Delayed Reinforcement Learning by Imitation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18063", "id": "18063", "proceeding": "https://proceedings.mlr.press/v162/liotet22a.html", "poster": "/media/PosterPDFs/ICML%202022/bca82e41ee7b0833588399b1fcd177c7.png?t=1657279125.0689442", "slides": "", "author_site": "Pierre Liotet, Davide Maran, Lorenzo Bisi, Marcello Restelli", "author": "Pierre Liotet; Davide Maran; Lorenzo Bisi; Marcello Restelli", "abstract": "When the agent\u2019s observations or interactions are delayed, classic reinforcement learning tools usually fail. In this paper, we propose a simple yet new and efficient solution to this problem. We assume that, in the undelayed environment, an efficient policy is known or can be easily learnt, but the task may suffer from delays in practice and we thus want to take them into account. We present a novel algorithm, Delayed Imitation with Dataset Aggregation (DIDA), which builds upon imitation learning methods to learn how to act in a delayed environment from undelayed demonstrations. We provide a theoretical analysis of the approach that will guide the practical design of DIDA. These results are also of general interest in the delayed reinforcement learning literature by providing bounds on the performance between delayed and undelayed tasks, under smoothness conditions. 
We show empirically that DIDA obtains high performances with a remarkable sample efficiency on a variety of tasks, including robotic locomotion, classic control, and trading.", "bibtex": "@InProceedings{pmlr-v162-liotet22a,\n title = \t {Delayed Reinforcement Learning by Imitation},\n author = {Liotet, Pierre and Maran, Davide and Bisi, Lorenzo and Restelli, Marcello},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13528--13556},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liotet22a/liotet22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/liotet22a.html},\n abstract = \t {When the agent\u2019s observations or interactions are delayed, classic reinforcement learning tools usually fail. In this paper, we propose a simple yet new and efficient solution to this problem. We assume that, in the undelayed environment, an efficient policy is known or can be easily learnt, but the task may suffer from delays in practice and we thus want to take them into account. We present a novel algorithm, Delayed Imitation with Dataset Aggregation (DIDA), which builds upon imitation learning methods to learn how to act in a delayed environment from undelayed demonstrations. We provide a theoretical analysis of the approach that will guide the practical design of DIDA. These results are also of general interest in the delayed reinforcement learning literature by providing bounds on the performance between delayed and undelayed tasks, under smoothness conditions. We show empirically that DIDA obtains high performances with a remarkable sample efficiency on a variety of tasks, including robotic locomotion, classic control, and trading.}\n}", "pdf": "https://proceedings.mlr.press/v162/liotet22a/liotet22a.pdf", "supp": "", "pdf_size": 852292, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13184718733220674095&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Milano; Politecnico di Milano", "aff_domain": "polimi.it; ; ; ", "email": "polimi.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liotet22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Deletion Robust Submodular Maximization over Matroids", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17357", "id": "17357", "proceeding": "https://proceedings.mlr.press/v162/duetting22a.html", "poster": "", "slides": "", "author_site": "PAUL DUETTING, Federico Fusco, Silvio Lattanzi, Ashkan Norouzi-Fard, Morteza Zadimoghaddam", "author": "Paul Duetting; Federico Fusco; Silvio Lattanzi; Ashkan Norouzi-Fard; Morteza Zadimoghaddam", "abstract": "Maximizing a monotone submodular function is a fundamental task in machine learning. In this paper we study the deletion robust version of the problem under the classic matroids constraint. 
Here the goal is to extract a small size summary of the dataset that contains a high value independent set even after an adversary deleted some elements. We present constant-factor approximation algorithms, whose space complexity depends on the rank $k$ of the matroid and the number $d$ of deleted elements. In the centralized setting we present a $(3.582+O(\\varepsilon))$-approximation algorithm with summary size $O(k + \\frac{d}{\\varepsilon^2}\\log \\frac{k}{\\varepsilon})$. In the streaming setting we provide a $(5.582+O(\\varepsilon))$-approximation algorithm with summary size and memory $O(k + \\frac{d}{\\varepsilon^2}\\log \\frac{k}{\\varepsilon})$. We complement our theoretical results with an in-depth experimental analysis showing the effectiveness of our algorithms on real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-duetting22a,\n title = \t {Deletion Robust Submodular Maximization over Matroids},\n author = {Duetting, Paul and Fusco, Federico and Lattanzi, Silvio and Norouzi-Fard, Ashkan and Zadimoghaddam, Morteza},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5671--5693},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/duetting22a/duetting22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/duetting22a.html},\n abstract = \t {Maximizing a monotone submodular function is a fundamental task in machine learning. In this paper we study the deletion robust version of the problem under the classic matroids constraint. Here the goal is to extract a small size summary of the dataset that contains a high value independent set even after an adversary deleted some elements. We present constant-factor approximation algorithms, whose space complexity depends on the rank $k$ of the matroid and the number $d$ of deleted elements. In the centralized setting we present a $(3.582+O(\\varepsilon))$-approximation algorithm with summary size $O(k + \\frac{d}{\\varepsilon^2}\\log \\frac{k}{\\varepsilon})$. In the streaming setting we provide a $(5.582+O(\\varepsilon))$-approximation algorithm with summary size and memory $O(k + \\frac{d}{\\varepsilon^2}\\log \\frac{k}{\\varepsilon})$. 
We complement our theoretical results with an in-depth experimental analysis showing the effectiveness of our algorithms on real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/duetting22a/duetting22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/duetting22a-supp.zip", "pdf_size": 2235789, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=411303238318231156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Research, Z \u00a8urich, Switzerland; Department of Computer, Control and Management Engineering \u201cAntonio Ruberti\u201d, Sapienza University of Rome, Rome, Italy + Google Research, Z \u00a8urich, Switzerland; Google Research, Z \u00a8urich, Switzerland; Google Research, Z \u00a8urich, Switzerland; Google Research, Z \u00a8urich, Switzerland", "aff_domain": "diag.uniroma1.it; ; ; ; ", "email": "diag.uniroma1.it; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/duetting22a.html", "aff_unique_index": "0;1+0;0;0;0", "aff_unique_norm": "Google;Sapienza University of Rome", "aff_unique_dep": "Google Research;Department of Computer, Control and Management Engineering \"Antonio Ruberti\"", "aff_unique_url": "https://research.google;https://www.uniroma1.it", "aff_unique_abbr": "Google Res.;Sapienza", "aff_campus_unique_index": "0;1+0;0;0;0", "aff_campus_unique": "Z\u00fcrich;Rome", "aff_country_unique_index": "0;1+0;0;0;0", "aff_country_unique": "Switzerland;Italy" }, { "title": "Demystifying the Adversarial Robustness of Random Transformation Defenses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18177", "id": "18177", "proceeding": "https://proceedings.mlr.press/v162/sitawarin22a.html", "poster": "/media/PosterPDFs/ICML%202022/9e7ba617ad9e69b39bd0c29335b79629.png?t=1657954445.6184723", "slides": "/media/icml-2022/Slides/18177.pdf", "author_site": "Chawin Sitawarin, Zachary Golan-Strieb, David Wagner", "author": "Chawin Sitawarin; Zachary J Golan-Strieb; David Wagner", "abstract": "Neural networks\u2019 lack of robustness against attacks raises concerns in security-sensitive settings such as autonomous vehicles. While many countermeasures may look promising, only a few withstand rigorous evaluation. Defenses using random transformations (RT) have shown impressive results, particularly BaRT (Raff et al., 2019) on ImageNet. However, this type of defense has not been rigorously evaluated, leaving its robustness properties poorly understood. Their stochastic properties make evaluation more challenging and render many proposed attacks on deterministic models inapplicable. First, we show that the BPDA attack (Athalye et al., 2018a) used in BaRT\u2019s evaluation is ineffective and likely overestimates its robustness. We then attempt to construct the strongest possible RT defense through the informed selection of transformations and Bayesian optimization for tuning their parameters. Furthermore, we create the strongest possible attack to evaluate our RT defense. Our new attack vastly outperforms the baseline, reducing the accuracy by 83% compared to the 19% reduction by the commonly used EoT attack ($4.3\\times$ improvement). Our result indicates that the RT defense on the Imagenette dataset (a ten-class subset of ImageNet) is not robust against adversarial examples. Extending the study further, we use our new attack to adversarially train RT defense (called AdvRT), resulting in a large robustness gain. 
Code is available at https://github.com/wagnergroup/demystify-random-transform.", "bibtex": "@InProceedings{pmlr-v162-sitawarin22a,\n title = \t {Demystifying the Adversarial Robustness of Random Transformation Defenses},\n author = {Sitawarin, Chawin and Golan-Strieb, Zachary J and Wagner, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20232--20252},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sitawarin22a/sitawarin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sitawarin22a.html},\n abstract = \t {Neural networks\u2019 lack of robustness against attacks raises concerns in security-sensitive settings such as autonomous vehicles. While many countermeasures may look promising, only a few withstand rigorous evaluation. Defenses using random transformations (RT) have shown impressive results, particularly BaRT (Raff et al., 2019) on ImageNet. However, this type of defense has not been rigorously evaluated, leaving its robustness properties poorly understood. Their stochastic properties make evaluation more challenging and render many proposed attacks on deterministic models inapplicable. First, we show that the BPDA attack (Athalye et al., 2018a) used in BaRT\u2019s evaluation is ineffective and likely overestimates its robustness. We then attempt to construct the strongest possible RT defense through the informed selection of transformations and Bayesian optimization for tuning their parameters. Furthermore, we create the strongest possible attack to evaluate our RT defense. Our new attack vastly outperforms the baseline, reducing the accuracy by 83% compared to the 19% reduction by the commonly used EoT attack ($4.3\\times$ improvement). Our result indicates that the RT defense on the Imagenette dataset (a ten-class subset of ImageNet) is not robust against adversarial examples. Extending the study further, we use our new attack to adversarially train RT defense (called AdvRT), resulting in a large robustness gain. 
Code is available at https://github.com/wagnergroup/demystify-random-transform.}\n}", "pdf": "https://proceedings.mlr.press/v162/sitawarin22a/sitawarin22a.pdf", "supp": "", "pdf_size": 6404452, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6394427111079703523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Electrical Engineering and Computer Sciences, University of California, Berkeley; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "https://github.com/wagner-group/demystify-random-transform", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sitawarin22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Denoised MDPs: Learning World Models Better Than the World Itself", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17443", "id": "17443", "proceeding": "https://proceedings.mlr.press/v162/wang22c.html", "poster": "/media/PosterPDFs/ICML%202022/07563a3fe3bbe7e3ba84431ad9d055af.png?t=1658064916.8247142", "slides": "", "author_site": "Tongzhou Wang, Simon Du, Antonio Torralba, Phillip Isola, Amy Zhang, Yuandong Tian", "author": "Tongzhou Wang; Simon Du; Antonio Torralba; Phillip Isola; Amy Zhang; Yuandong Tian", "abstract": "The ability to separate signal from noise, and reason with clean abstractions, is critical to intelligence. With this ability, humans can efficiently perform real world tasks without considering all possible nuisance factors. How can artificial agents do the same? What kind of information can agents safely discard as noises? In this work, we categorize information out in the wild into four types based on controllability and relation with reward, and formulate useful information as that which is both controllable and reward-relevant. This framework clarifies the kinds information removed by various prior work on representation learning in reinforcement learning (RL), and leads to our proposed approach of learning a Denoised MDP that explicitly factors out certain noise distractors. Extensive experiments on variants of DeepMind Control Suite and RoboDesk demonstrate superior performance of our denoised world model over using raw observations alone, and over prior works, across policy optimization control tasks as well as the non-control task of joint position regression. 
Project Page: https://ssnl.github.io/denoised_mdp/ Code: https://github.com/facebookresearch/denoised_mdp/", "bibtex": "@InProceedings{pmlr-v162-wang22c,\n title = \t {Denoised {MDP}s: Learning World Models Better Than the World Itself},\n author = {Wang, Tongzhou and Du, Simon and Torralba, Antonio and Isola, Phillip and Zhang, Amy and Tian, Yuandong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22591--22612},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22c/wang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22c.html},\n abstract = \t {The ability to separate signal from noise, and reason with clean abstractions, is critical to intelligence. With this ability, humans can efficiently perform real world tasks without considering all possible nuisance factors. How can artificial agents do the same? What kind of information can agents safely discard as noises? In this work, we categorize information out in the wild into four types based on controllability and relation with reward, and formulate useful information as that which is both controllable and reward-relevant. This framework clarifies the kinds information removed by various prior work on representation learning in reinforcement learning (RL), and leads to our proposed approach of learning a Denoised MDP that explicitly factors out certain noise distractors. Extensive experiments on variants of DeepMind Control Suite and RoboDesk demonstrate superior performance of our denoised world model over using raw observations alone, and over prior works, across policy optimization control tasks as well as the non-control task of joint position regression. 
Project Page: https://ssnl.github.io/denoised_mdp/ Code: https://github.com/facebookresearch/denoised_mdp/}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22c/wang22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22c-supp.zip", "pdf_size": 9097077, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4094945741122544681&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "MIT CSAIL; University of Washington; MIT CSAIL; MIT CSAIL; UC Berkeley + Meta AI; Meta AI", "aff_domain": "mit.edu; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ", "github": "github.com/facebookresearch/denoised_mdp", "project": "ssnl.github.io/denoised_mdp", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22c.html", "aff_unique_index": "0;1;0;0;2+3;3", "aff_unique_norm": "Massachusetts Institute of Technology;University of Washington;University of California, Berkeley;Meta", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;;;Meta AI", "aff_unique_url": "https://www.csail.mit.edu;https://www.washington.edu;https://www.berkeley.edu;https://meta.com", "aff_unique_abbr": "MIT CSAIL;UW;UC Berkeley;Meta", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Cambridge;;Berkeley", "aff_country_unique_index": "0;0;0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Deploying Convolutional Networks on Untrusted Platforms Using 2D Holographic Reduced Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16461", "id": "16461", "proceeding": "https://proceedings.mlr.press/v162/alam22a.html", "poster": "/media/PosterPDFs/ICML%202022/4454c95ca2b2b298057cbcb3bdcbb566_4zMYDLB.png?t=1657203176.9356089", "slides": "", "author_site": "Mohammad Mahmudul Alam, Edward Raff, Tim Oates, James Holt", "author": "Mohammad Mahmudul Alam; Edward Raff; Tim Oates; James Holt", "abstract": "Due to the computational cost of running inference for a neural network, the need to deploy the inferential steps on a third party\u2019s compute environment or hardware is common. If the third party is not fully trusted, it is desirable to obfuscate the nature of the inputs and outputs, so that the third party can not easily determine what specific task is being performed. Provably secure protocols for leveraging an untrusted party exist but are too computational demanding to run in practice. We instead explore a different strategy of fast, heuristic security that we call", "bibtex": "@InProceedings{pmlr-v162-alam22a,\n title = \t {Deploying Convolutional Networks on Untrusted Platforms Using 2{D} Holographic Reduced Representations},\n author = {Alam, Mohammad Mahmudul and Raff, Edward and Oates, Tim and Holt, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {367--393},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alam22a/alam22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alam22a.html},\n abstract = \t {Due to the computational cost of running inference for a neural network, the need to deploy the inferential steps on a third party\u2019s compute environment or hardware is common. 
If the third party is not fully trusted, it is desirable to obfuscate the nature of the inputs and outputs, so that the third party can not easily determine what specific task is being performed. Provably secure protocols for leveraging an untrusted party exist but are too computational demanding to run in practice. We instead explore a different strategy of fast, heuristic security that we call", "pdf": "https://proceedings.mlr.press/v162/alam22a/alam22a.pdf", "supp": "", "pdf_size": 16081840, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7363780369551842627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Electrical Engineering, University of Maryland, Baltimore County, Baltimore, MD, USA+Laboratory for Physical Sciences, College Park, MD, USA+Booz Allen Hamilton, McLean, VA, USA; Department of Computer Science and Electrical Engineering, University of Maryland, Baltimore County, Baltimore, MD, USA+Laboratory for Physical Sciences, College Park, MD, USA+Booz Allen Hamilton, McLean, VA, USA; Department of Computer Science and Electrical Engineering, University of Maryland, Baltimore County, Baltimore, MD, USA; Laboratory for Physical Sciences, College Park, MD, USA", "aff_domain": "bah.com;cs.umbc.edu; ; ", "email": "bah.com;cs.umbc.edu; ; ", "github": "https://github.com/NeuromorphicComputationResearchProgram/Connectionist-Symbolic-Pseudo-Secrets", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/alam22a.html", "aff_unique_index": "0+1+2;0+1+2;0;1", "aff_unique_norm": "University of Maryland, Baltimore County;Laboratory for Physical Sciences;Booz Allen Hamilton", "aff_unique_dep": "Department of Computer Science and Electrical Engineering;;", "aff_unique_url": "https://www.umbc.edu;;https://www.boozallen.com", "aff_unique_abbr": "UMBC;;", "aff_campus_unique_index": "0+1+2;0+1+2;0;1", "aff_campus_unique": "Baltimore;College Park;McLean", "aff_country_unique_index": "0+0+0;0+0+0;0;0", "aff_country_unique": "United States" }, { "title": "DepthShrinker: A New Compression Paradigm Towards Boosting Real-Hardware Efficiency of Compact Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17549", "id": "17549", "proceeding": "https://proceedings.mlr.press/v162/fu22c.html", "poster": "/media/PosterPDFs/ICML%202022/889091ff744069cab08dc605d162a8d3.png?t=1657518246.0224702", "slides": "", "author_site": "Yonggan Fu, Haichuan Yang, Jiayi Yuan, Meng Li, Cheng Wan, Raghuraman Krishnamoorthi, Vikas Chandra, Yingyan Lin", "author": "Yonggan Fu; Haichuan Yang; Jiayi Yuan; Meng Li; Cheng Wan; Raghuraman Krishnamoorthi; Vikas Chandra; Yingyan Lin", "abstract": "Efficient deep neural network (DNN) models equipped with compact operators (e.g., depthwise convolutions) have shown great potential in reducing DNNs\u2019 theoretical complexity (e.g., the total number of weights/operations) while maintaining a decent model accuracy. However, existing efficient DNNs are still limited in fulfilling their promise in boosting real-hardware efficiency, due to their commonly adopted compact operators\u2019 low hardware utilization. In this work, we open up a new compression paradigm for developing real-hardware efficient DNNs, leading to boosted hardware efficiency while maintaining model accuracy. 
Interestingly, we observe that while some DNN layers\u2019 activation functions help DNNs\u2019 training optimization and achievable accuracy, they can be properly removed after training without compromising the model accuracy. Inspired by this observation, we propose a framework dubbed DepthShrinker, which develops hardware-friendly compact networks via shrinking the basic building blocks of existing efficient DNNs that feature irregular computation patterns into dense ones with much improved hardware utilization and thus real-hardware efficiency. Excitingly, our DepthShrinker framework delivers hardware-friendly compact networks that outperform both state-of-the-art efficient DNNs and compression techniques, e.g., a 3.06% higher accuracy and 1.53x throughput on Tesla V100 over SOTA channel-wise pruning method MetaPruning. Our codes are available at: https://github.com/facebookresearch/DepthShrinker.", "bibtex": "@InProceedings{pmlr-v162-fu22c,\n title = \t {{D}epth{S}hrinker: A New Compression Paradigm Towards Boosting Real-Hardware Efficiency of Compact Neural Networks},\n author = {Fu, Yonggan and Yang, Haichuan and Yuan, Jiayi and Li, Meng and Wan, Cheng and Krishnamoorthi, Raghuraman and Chandra, Vikas and Lin, Yingyan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6849--6862},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fu22c/fu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/fu22c.html},\n abstract = \t {Efficient deep neural network (DNN) models equipped with compact operators (e.g., depthwise convolutions) have shown great potential in reducing DNNs\u2019 theoretical complexity (e.g., the total number of weights/operations) while maintaining a decent model accuracy. However, existing efficient DNNs are still limited in fulfilling their promise in boosting real-hardware efficiency, due to their commonly adopted compact operators\u2019 low hardware utilization. In this work, we open up a new compression paradigm for developing real-hardware efficient DNNs, leading to boosted hardware efficiency while maintaining model accuracy. Interestingly, we observe that while some DNN layers\u2019 activation functions help DNNs\u2019 training optimization and achievable accuracy, they can be properly removed after training without compromising the model accuracy. Inspired by this observation, we propose a framework dubbed DepthShrinker, which develops hardware-friendly compact networks via shrinking the basic building blocks of existing efficient DNNs that feature irregular computation patterns into dense ones with much improved hardware utilization and thus real-hardware efficiency. Excitingly, our DepthShrinker framework delivers hardware-friendly compact networks that outperform both state-of-the-art efficient DNNs and compression techniques, e.g., a 3.06% higher accuracy and 1.53x throughput on Tesla V100 over SOTA channel-wise pruning method MetaPruning. 
Our codes are available at: https://github.com/facebookresearch/DepthShrinker.}\n}", "pdf": "https://proceedings.mlr.press/v162/fu22c/fu22c.pdf", "supp": "", "pdf_size": 500040, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13003128521759488248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, Rice University; Meta Inc.; Department of Electrical and Computer Engineering, Rice University; Meta Inc.; Department of Electrical and Computer Engineering, Rice University; Meta Inc.; Meta Inc.; Department of Electrical and Computer Engineering, Rice University", "aff_domain": "rice.edu; ; ; ; ; ; ;rice.edu", "email": "rice.edu; ; ; ; ; ; ;rice.edu", "github": "https://github.com/facebookresearch/DepthShrinker", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/fu22c.html", "aff_unique_index": "0;1;0;1;0;1;1;0", "aff_unique_norm": "Rice University;Meta", "aff_unique_dep": "Department of Electrical and Computer Engineering;Meta Platforms, Inc.", "aff_unique_url": "https://www.rice.edu;https://www.meta.com", "aff_unique_abbr": "Rice;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Describing Differences between Text Distributions with Natural Language", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16585", "id": "16585", "proceeding": "https://proceedings.mlr.press/v162/zhong22a.html", "poster": "", "slides": "", "author_site": "Ruiqi Zhong, Charlie Snell, Dan Klein, Jacob Steinhardt", "author": "Ruiqi Zhong; Charlie Snell; Dan Klein; Jacob Steinhardt", "abstract": "How do two", "bibtex": "@InProceedings{pmlr-v162-zhong22a,\n title = \t {Describing Differences between Text Distributions with Natural Language},\n author = {Zhong, Ruiqi and Snell, Charlie and Klein, Dan and Steinhardt, Jacob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27099--27116},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhong22a/zhong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhong22a.html},\n abstract = \t {How do two", "pdf": "https://proceedings.mlr.press/v162/zhong22a/zhong22a.pdf", "supp": "", "pdf_size": 1644381, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12276789524717856994&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Computer Science Division, University of California, Berkeley; Computer Science Division, University of California, Berkeley; Computer Science Division, University of California, Berkeley; Computer Science Division, University of California, Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhong22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Computer Science Division", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United States" }, { "title": "Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17231", "id": "17231", "proceeding": "https://proceedings.mlr.press/v162/trabucco22a.html", "poster": "/media/PosterPDFs/ICML%202022/48c4a756d0ce842a039b2ee9e3f05174.png?t=1657997038.7787082", "slides": "", "author_site": "Brandon Trabucco, Xinyang Geng, Aviral Kumar, Sergey Levine", "author": "Brandon Trabucco; Xinyang Geng; Aviral Kumar; Sergey Levine", "abstract": "Black-box model-based optimization (MBO) problems, where the goal is to find a design input that maximizes an unknown objective function, are ubiquitous in a wide range of domains, such as the design of proteins, DNA sequences, aircraft, and robots. Solving model-based optimization problems typically requires actively querying the unknown objective function on design proposals, which means physically building the candidate molecule, aircraft, or robot, testing it, and storing the result. This process can be expensive and time consuming, and one might instead prefer to optimize for the best design using only the data one already has. This setting\u2014called offline MBO\u2014poses substantial and different algorithmic challenges than more commonly studied online techniques. A number of recent works have demonstrated success with offline MBO for high-dimensional optimization problems using high-capacity deep neural networks. However, the lack of standardized benchmarks in this emerging field is making progress difficult to track. To address this, we present Design-Bench, a benchmark for offline MBO with a unified evaluation protocol and reference implementations of recent methods. Our benchmark includes a suite of diverse and realistic tasks derived from real-world optimization problems in biology, materials science, and robotics that present distinct challenges for offline MBO. Our benchmark and reference implementations are released at github.com/rail-berkeley/design-bench and github.com/rail-berkeley/design-baselines.", "bibtex": "@InProceedings{pmlr-v162-trabucco22a,\n title = \t {Design-Bench: Benchmarks for Data-Driven Offline Model-Based Optimization},\n author = {Trabucco, Brandon and Geng, Xinyang and Kumar, Aviral and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21658--21676},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/trabucco22a/trabucco22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/trabucco22a.html},\n abstract = \t {Black-box model-based optimization (MBO) problems, where the goal is to find a design input that maximizes an unknown objective function, are ubiquitous in a wide range of domains, such as the design of proteins, DNA sequences, aircraft, and robots. Solving model-based optimization problems typically requires actively querying the unknown objective function on design proposals, which means physically building the candidate molecule, aircraft, or robot, testing it, and storing the result. This process can be expensive and time consuming, and one might instead prefer to optimize for the best design using only the data one already has. 
This setting\u2014called offline MBO\u2014poses substantial and different algorithmic challenges than more commonly studied online techniques. A number of recent works have demonstrated success with offline MBO for high-dimensional optimization problems using high-capacity deep neural networks. However, the lack of standardized benchmarks in this emerging field is making progress difficult to track. To address this, we present Design-Bench, a benchmark for offline MBO with a unified evaluation protocol and reference implementations of recent methods. Our benchmark includes a suite of diverse and realistic tasks derived from real-world optimization problems in biology, materials science, and robotics that present distinct challenges for offline MBO. Our benchmark and reference implementations are released at github.com/rail-berkeley/design-bench and github.com/rail-berkeley/design-baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/trabucco22a/trabucco22a.pdf", "supp": "", "pdf_size": 2854947, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10179405789523427724&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "github.com/rail-berkeley/design-bench", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/trabucco22a.html" }, { "title": "Detached Error Feedback for Distributed SGD with Random Sparsification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16507", "id": "16507", "proceeding": "https://proceedings.mlr.press/v162/xu22c.html", "poster": "/media/PosterPDFs/ICML%202022/c361bc7b2c033a83d663b8d9fb4be56e.png?t=1657570092.3999586", "slides": "", "author_site": "An Xu, Heng Huang", "author": "An Xu; Heng Huang", "abstract": "The communication bottleneck has been a critical problem in large-scale distributed deep learning. In this work, we study distributed SGD with random block-wise sparsification as the gradient compressor, which is ring-allreduce compatible and highly computation-efficient but leads to inferior performance. To tackle this important issue, we improve the communication-efficient distributed SGD from a novel aspect, that is, the trade-off between the variance and second moment of the gradient. With this motivation, we propose a new detached error feedback (DEF) algorithm, which shows better convergence bound than error feedback for non-convex problems. We also propose DEF-A to accelerate the generalization of DEF at the early stages of the training, which shows better generalization bounds than DEF. Furthermore, we establish the connection between communication-efficient distributed SGD and SGD with iterate averaging (SGD-IA) for the first time. Extensive deep learning experiments show significant empirical improvement of the proposed methods under various settings. 
Our reproducible codes and scripts for all experiments in this work will be made publicly available.", "bibtex": "@InProceedings{pmlr-v162-xu22c,\n title = \t {Detached Error Feedback for Distributed {SGD} with Random Sparsification},\n author = {Xu, An and Huang, Heng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24550--24575},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22c/xu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22c.html},\n abstract = \t {The communication bottleneck has been a critical problem in large-scale distributed deep learning. In this work, we study distributed SGD with random block-wise sparsification as the gradient compressor, which is ring-allreduce compatible and highly computation-efficient but leads to inferior performance. To tackle this important issue, we improve the communication-efficient distributed SGD from a novel aspect, that is, the trade-off between the variance and second moment of the gradient. With this motivation, we propose a new detached error feedback (DEF) algorithm, which shows better convergence bound than error feedback for non-convex problems. We also propose DEF-A to accelerate the generalization of DEF at the early stages of the training, which shows better generalization bounds than DEF. Furthermore, we establish the connection between communication-efficient distributed SGD and SGD with iterate averaging (SGD-IA) for the first time. Extensive deep learning experiments show significant empirical improvement of the proposed methods under various settings. Our reproducible codes and scripts for all experiments in this work will be made publicly available.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22c/xu22c.pdf", "supp": "", "pdf_size": 743302, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2556379652170131494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, University of Pittsburgh, Pittsburgh, PA 15213, USA; Department of Electrical and Computer Engineering, University of Pittsburgh, Pittsburgh, PA 15213, USA", "aff_domain": "pitt.edu;pitt.edu", "email": "pitt.edu;pitt.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/xu22c.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Pittsburgh", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.pitt.edu", "aff_unique_abbr": "Pitt", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Detecting Adversarial Examples Is (Nearly) As Hard As Classifying Them", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16167", "id": "16167", "proceeding": "https://proceedings.mlr.press/v162/tramer22a.html", "poster": "/media/PosterPDFs/ICML%202022/3c333aadfc3ee8ecb8d77ee31197d96a.png?t=1657440489.5400035", "slides": "", "author": "Florian Tramer", "abstract": "Making classifiers robust to adversarial examples is challenging. 
Thus, many works tackle the seemingly easier task of", "bibtex": "@InProceedings{pmlr-v162-tramer22a,\n title = \t {Detecting Adversarial Examples Is ({N}early) As Hard As Classifying Them},\n author = {Tramer, Florian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21692--21702},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tramer22a/tramer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tramer22a.html},\n abstract = \t {Making classifiers robust to adversarial examples is challenging. Thus, many works tackle the seemingly easier task of", "pdf": "https://proceedings.mlr.press/v162/tramer22a/tramer22a.pdf", "supp": "", "pdf_size": 315419, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2383487011870063609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "", "aff_domain": "", "email": "", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/tramer22a.html" }, { "title": "Detecting Corrupted Labels Without Training a Model to Predict", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17021", "id": "17021", "proceeding": "https://proceedings.mlr.press/v162/zhu22a.html", "poster": "/media/PosterPDFs/ICML%202022/46515dcd99ea50dd0671bc6840830404.png?t=1658242307.033621", "slides": "", "author_site": "Zhaowei Zhu, Zihao Dong, Yang Liu", "author": "Zhaowei Zhu; Zihao Dong; Yang Liu", "abstract": "Label noise in real-world datasets encodes wrong correlation patterns and impairs the generalization of deep neural networks (DNNs). It is critical to find efficient ways to detect corrupted patterns. Current methods primarily focus on designing robust training techniques to prevent DNNs from memorizing corrupted patterns. These approaches often require customized training processes and may overfit corrupted patterns, leading to a performance drop in detection. In this paper, from a more data-centric perspective, we propose a training-free solution to detect corrupted labels. Intuitively, \u201ccloser\u201d instances are more likely to share the same clean label. Based on the neighborhood information, we propose two methods: the first one uses \u201clocal voting\" via checking the noisy label consensuses of nearby features. The second one is a ranking-based approach that scores each instance and filters out a guaranteed number of instances that are likely to be corrupted. We theoretically analyze how the quality of features affects the local voting and provide guidelines for tuning neighborhood size. We also prove the worst-case error bound for the ranking-based method. Experiments with both synthetic and real-world label noise demonstrate our training-free solutions consistently and significantly improve most of the training-based baselines. 
Code is available at github.com/UCSC-REAL/SimiFeat.", "bibtex": "@InProceedings{pmlr-v162-zhu22a,\n title = \t {Detecting Corrupted Labels Without Training a Model to Predict},\n author = {Zhu, Zhaowei and Dong, Zihao and Liu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27412--27427},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22a/zhu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22a.html},\n abstract = \t {Label noise in real-world datasets encodes wrong correlation patterns and impairs the generalization of deep neural networks (DNNs). It is critical to find efficient ways to detect corrupted patterns. Current methods primarily focus on designing robust training techniques to prevent DNNs from memorizing corrupted patterns. These approaches often require customized training processes and may overfit corrupted patterns, leading to a performance drop in detection. In this paper, from a more data-centric perspective, we propose a training-free solution to detect corrupted labels. Intuitively, \u201ccloser\u201d instances are more likely to share the same clean label. Based on the neighborhood information, we propose two methods: the first one uses \u201clocal voting\" via checking the noisy label consensuses of nearby features. The second one is a ranking-based approach that scores each instance and filters out a guaranteed number of instances that are likely to be corrupted. We theoretically analyze how the quality of features affects the local voting and provide guidelines for tuning neighborhood size. We also prove the worst-case error bound for the ranking-based method. Experiments with both synthetic and real-world label noise demonstrate our training-free solutions consistently and significantly improve most of the training-based baselines. 
Code is available at github.com/UCSC-REAL/SimiFeat.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22a/zhu22a.pdf", "supp": "", "pdf_size": 515374, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9456693352680548441&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA", "aff_domain": "ucsc.edu;ucsc.edu;ucsc.edu", "email": "ucsc.edu;ucsc.edu;ucsc.edu", "github": "github.com/UCSC-REAL/SimiFeat", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhu22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Santa Cruz", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ucsc.edu", "aff_unique_abbr": "UCSC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Dialog Inpainting: Turning Documents into Dialogs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17589", "id": "17589", "proceeding": "https://proceedings.mlr.press/v162/dai22a.html", "poster": "/media/PosterPDFs/ICML%202022/d7aab42e6b85c49c0f1d3a115e939c74_SXB6bCV.png?t=1658087873.9124937", "slides": "", "author_site": "Zhuyun Dai, Arun Tejasvi Chaganty, Vincent Zhao, Aida Amini, Qazi Mamunur Rashid, Mike Green, Kelvin Guu", "author": "Zhuyun Dai; Arun Tejasvi Chaganty; Vincent Y Zhao; Aida Amini; Qazi Mamunur Rashid; Mike Green; Kelvin Guu", "abstract": "Many important questions (e.g. \"How to eat healthier?\") require conversation to establish context and explore in depth. However, conversational question answering (ConvQA) systems have long been stymied by scarce training data that is expensive to collect. To address this problem, we propose a new technique for synthetically generating diverse and high-quality dialog data: dialog inpainting. Our approach takes the text of any document and transforms it into a two-person dialog between the writer and an imagined reader: we treat sentences from the article as utterances spoken by the writer, and then use a dialog inpainter to predict what the imagined reader asked or said in between each of the writer\u2019s utterances. By applying this approach to passages from Wikipedia and the web, we produce WikiDialog and WebDialog, two datasets totalling 19 million diverse information-seeking dialogs \u2013 1,000x larger than the largest existing ConvQA dataset. Furthermore, human raters judge the answer adequacy and conversationality of WikiDialog to be as good or better than existing manually-collected datasets. Remarkably, our approach shows strong zero-shot capability, generating high quality synthetic data without using any in-domain ConvQA data. 
Using our inpainted data to pre-train ConvQA retrieval systems, we significantly advance state-of-the-art across three benchmarks (QReCC, OR-QuAC, TREC CAsT) yielding up to 40% relative gains on standard evaluation metrics.", "bibtex": "@InProceedings{pmlr-v162-dai22a,\n title = \t {Dialog Inpainting: Turning Documents into Dialogs},\n author = {Dai, Zhuyun and Chaganty, Arun Tejasvi and Zhao, Vincent Y and Amini, Aida and Rashid, Qazi Mamunur and Green, Mike and Guu, Kelvin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4558--4586},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dai22a/dai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dai22a.html},\n abstract = \t {Many important questions (e.g. \"How to eat healthier?\") require conversation to establish context and explore in depth. However, conversational question answering (ConvQA) systems have long been stymied by scarce training data that is expensive to collect. To address this problem, we propose a new technique for synthetically generating diverse and high-quality dialog data: dialog inpainting. Our approach takes the text of any document and transforms it into a two-person dialog between the writer and an imagined reader: we treat sentences from the article as utterances spoken by the writer, and then use a dialog inpainter to predict what the imagined reader asked or said in between each of the writer\u2019s utterances. By applying this approach to passages from Wikipedia and the web, we produce WikiDialog and WebDialog, two datasets totalling 19 million diverse information-seeking dialogs \u2013 1,000x larger than the largest existing ConvQA dataset. Furthermore, human raters judge the answer adequacy and conversationality of WikiDialog to be as good or better than existing manually-collected datasets. Remarkably, our approach shows strong zero-shot capability, generating high quality synthetic data without using any in-domain ConvQA data. 
Using our inpainted data to pre-train ConvQA retrieval systems, we significantly advance state-of-the-art across three benchmarks (QReCC, OR-QuAC, TREC CAsT) yielding up to 40% relative gains on standard evaluation metrics.}\n}", "pdf": "https://proceedings.mlr.press/v162/dai22a/dai22a.pdf", "supp": "", "pdf_size": 1391146, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13888132119591432248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Google Inc., Mountain View, USA; Google Inc., Mountain View, USA; Google Inc., Mountain View, USA; Google Inc., Mountain View, USA; Google Inc., Mountain View, USA; Google Inc., Mountain View, USA; Google Inc., Mountain View, USA", "aff_domain": "google.com;google.com;google.com;google.com;google.com;google.com;google.com", "email": "google.com;google.com;google.com;google.com;google.com;google.com;google.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/dai22a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Inc.", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Difference Advantage Estimation for Multi-Agent Policy Gradients", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16125", "id": "16125", "proceeding": "https://proceedings.mlr.press/v162/li22w.html", "poster": "/media/PosterPDFs/ICML%202022/f005e17eabbb0d38b06b8a78f3637d85.png?t=1657507981.6572158", "slides": "", "author_site": "yueheng li, Guangming Xie, Zongqing Lu", "author": "Yueheng Li; Guangming Xie; Zongqing Lu", "abstract": "Multi-agent policy gradient methods in centralized training with decentralized execution have recently witnessed much progress. During centralized training, multi-agent credit assignment is crucial, which can substantially promote learning performance. However, explicit multi-agent credit assignment in multi-agent policy gradient methods still receives relatively little attention. In this paper, we investigate multi-agent credit assignment induced by reward shaping and provide a theoretical understanding in terms of its credit assignment and policy bias. Based on this, we propose an exponentially weighted advantage estimator, which is analogous to GAE, to enable multi-agent credit assignment while allowing the tradeoff with policy bias. 
Empirical results show that our approach can successfully perform effective multi-agent credit assignment, and thus substantially outperforms other advantage estimators.", "bibtex": "@InProceedings{pmlr-v162-li22w,\n title = \t {Difference Advantage Estimation for Multi-Agent Policy Gradients},\n author = {Li, Yueheng and Xie, Guangming and Lu, Zongqing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13066--13085},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22w/li22w.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22w.html},\n abstract = \t {Multi-agent policy gradient methods in centralized training with decentralized execution have recently witnessed much progress. During centralized training, multi-agent credit assignment is crucial, which can substantially promote learning performance. However, explicit multi-agent credit assignment in multi-agent policy gradient methods still receives relatively little attention. In this paper, we investigate multi-agent credit assignment induced by reward shaping and provide a theoretical understanding in terms of its credit assignment and policy bias. Based on this, we propose an exponentially weighted advantage estimator, which is analogous to GAE, to enable multi-agent credit assignment while allowing the tradeoff with policy bias. Empirical results show that our approach can successfully perform effective multi-agent credit assignment, and thus substantially outperforms other advantage estimators.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22w/li22w.pdf", "supp": "", "pdf_size": 2697093, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17772271971366849329&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22w.html" }, { "title": "Differentiable Top-k Classification Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16463", "id": "16463", "proceeding": "https://proceedings.mlr.press/v162/petersen22a.html", "poster": "/media/PosterPDFs/ICML%202022/bd4d08cd70f4be1982372107b3b448ef.png?t=1657685074.3057377", "slides": "/media/icml-2022/Slides/16463.pdf", "author_site": "Felix Petersen, Hilde Kuehne, Christian Borgelt, Oliver Deussen", "author": "Felix Petersen; Hilde Kuehne; Christian Borgelt; Oliver Deussen", "abstract": "The top-k classification accuracy is one of the core metrics in machine learning. Here, k is conventionally a positive integer, such as 1 or 5, leading to top-1 or top-5 training objectives. In this work, we relax this assumption and optimize the model for multiple k simultaneously instead of using a single k. Leveraging recent advances in differentiable sorting and ranking, we propose a family of differentiable top-k cross-entropy classification losses. This allows training while not only considering the top-1 prediction, but also, e.g., the top-2 and top-5 predictions. We evaluate the proposed losses for fine-tuning on state-of-the-art architectures, as well as for training from scratch. 
We find that relaxing k not only produces better top-5 accuracies, but also leads to top-1 accuracy improvements. When fine-tuning publicly available ImageNet models, we achieve a new state-of-the-art for these models.", "bibtex": "@InProceedings{pmlr-v162-petersen22a,\n title = \t {Differentiable Top-k Classification Learning},\n author = {Petersen, Felix and Kuehne, Hilde and Borgelt, Christian and Deussen, Oliver},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17656--17668},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/petersen22a/petersen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/petersen22a.html},\n abstract = \t {The top-k classification accuracy is one of the core metrics in machine learning. Here, k is conventionally a positive integer, such as 1 or 5, leading to top-1 or top-5 training objectives. In this work, we relax this assumption and optimize the model for multiple k simultaneously instead of using a single k. Leveraging recent advances in differentiable sorting and ranking, we propose a family of differentiable top-k cross-entropy classification losses. This allows training while not only considering the top-1 prediction, but also, e.g., the top-2 and top-5 predictions. We evaluate the proposed losses for fine-tuning on state-of-the-art architectures, as well as for training from scratch. We find that relaxing k not only produces better top-5 accuracies, but also leads to top-1 accuracy improvements. 
When fine-tuning publicly available ImageNet models, we achieve a new state-of-the-art for these models.}\n}", "pdf": "https://proceedings.mlr.press/v162/petersen22a/petersen22a.pdf", "supp": "", "pdf_size": 637120, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2888939572667326983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "University of Konstanz; University of Frankfurt + MIT-IBM Watson AI Lab; University of Salzburg; University of Konstanz", "aff_domain": "uni.kn; ; ; ", "email": "uni.kn; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/petersen22a.html", "aff_unique_index": "0;1+2;3;0", "aff_unique_norm": "University of Konstanz;Goethe University Frankfurt;Massachusetts Institute of Technology;University of Salzburg", "aff_unique_dep": ";;IBM Watson AI Lab;", "aff_unique_url": "https://www.uni-konstanz.de;https://www.uni-frankfurt.de;https://www.mitibmwatsonailab.org;https://www.uni-salzburg.at", "aff_unique_abbr": "Uni Konstanz;Goethe Uni;MIT-IBM AI Lab;USAL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;2;0", "aff_country_unique": "Germany;United States;Austria" }, { "title": "Differentially Private Approximate Quantiles", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15973", "id": "15973", "proceeding": "https://proceedings.mlr.press/v162/kaplan22a.html", "poster": "/media/PosterPDFs/ICML%202022/f29b38f160f87ae86df31cee1982066f_cIXVwTj.png?t=1657200162.634761", "slides": "", "author_site": "Haim Kaplan, Shachar Schnapp, Uri Stemmer", "author": "Haim Kaplan; Shachar Schnapp; Uri Stemmer", "abstract": "In this work we study the problem of differentially private (DP) quantiles, in which given dataset $X$ and quantiles $q_1, ..., q_m \\in [0,1]$, we want to output $m$ quantile estimations which are as close as possible to the true quantiles and preserve DP. We describe a simple recursive DP algorithm, which we call Approximate Quantiles (AQ), for this task. We give a worst case upper bound on its error, and show that its error is much lower than that of previous implementations on several different datasets. Furthermore, it gets this low error while running two orders of magnitude faster than the best previous implementation.", "bibtex": "@InProceedings{pmlr-v162-kaplan22a,\n title = \t {Differentially Private Approximate Quantiles},\n author = {Kaplan, Haim and Schnapp, Shachar and Stemmer, Uri},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10751--10761},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kaplan22a/kaplan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kaplan22a.html},\n abstract = \t {In this work we study the problem of differentially private (DP) quantiles, in which given dataset $X$ and quantiles $q_1, ..., q_m \\in [0,1]$, we want to output $m$ quantile estimations which are as close as possible to the true quantiles and preserve DP. We describe a simple recursive DP algorithm, which we call Approximate Quantiles (AQ), for this task. 
We give a worst case upper bound on its error, and show that its error is much lower than that of previous implementations on several different datasets. Furthermore, it gets this low error while running two orders of magnitude faster than the best previous implementation.}\n}", "pdf": "https://proceedings.mlr.press/v162/kaplan22a/kaplan22a.pdf", "supp": "", "pdf_size": 2363349, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7634955190551364494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Tel Aviv University+Google Research; Ben-Gurion University; Tel Aviv University+Google Research", "aff_domain": "tau.ac.il;post.bgu.ac.il;tau.ac.il", "email": "tau.ac.il;post.bgu.ac.il;tau.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kaplan22a.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Tel Aviv University;Google;Ben-Gurion University of the Negev", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.tau.ac.il;https://research.google;https://www.bgu.ac.il", "aff_unique_abbr": "TAU;Google Research;BGU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+1;0;0+1", "aff_country_unique": "Israel;United States" }, { "title": "Differentially Private Community Detection for Stochastic Block Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16779", "id": "16779", "proceeding": "https://proceedings.mlr.press/v162/mohamed22a.html", "poster": "", "slides": "/media/icml-2022/Slides/16779.pdf", "author_site": "Mohamed Mohamed, Dung Nguyen, Anil Vullikanti, Ravi Tandon", "author": "Mohamed S Mohamed; Dung Nguyen; Anil Vullikanti; Ravi Tandon", "abstract": "The goal of community detection over graphs is to recover underlying labels/attributes of users (e.g., political affiliation) given the connectivity between users. There has been significant recent progress on understanding the fundamental limits of community detection when the graph is generated from a stochastic block model (SBM). Specifically, sharp information theoretic limits and efficient algorithms have been obtained for SBMs as a function of $p$ and $q$, which represent the intra-community and inter-community connection probabilities. In this paper, we study the community detection problem while preserving the privacy of the individual connections between the vertices. Focusing on the notion of $(\\epsilon, \\delta)$-edge differential privacy (DP), we seek to understand the fundamental tradeoffs between $(p, q)$, DP budget $(\\epsilon, \\delta)$, and computational efficiency for exact recovery of community labels. To this end, we present and analyze the associated information-theoretic tradeoffs for three differentially private community recovery mechanisms: a) stability based mechanism; b) sampling based mechanisms; and c) graph perturbation mechanisms. Our main findings are that stability and sampling based mechanisms lead to a superior tradeoff between $(p,q)$ and the privacy budget $(\\epsilon, \\delta)$; however this comes at the expense of higher computational complexity. 
On the other hand, albeit low complexity, graph perturbation mechanisms require the privacy budget $\\epsilon$ to scale as $\\Omega(\\log(n))$ for exact recovery.", "bibtex": "@InProceedings{pmlr-v162-mohamed22a,\n title = \t {Differentially Private Community Detection for Stochastic Block Models},\n author = {Mohamed, Mohamed S and Nguyen, Dung and Vullikanti, Anil and Tandon, Ravi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15858--15894},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mohamed22a/mohamed22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mohamed22a.html},\n abstract = \t {The goal of community detection over graphs is to recover underlying labels/attributes of users (e.g., political affiliation) given the connectivity between users. There has been significant recent progress on understanding the fundamental limits of community detection when the graph is generated from a stochastic block model (SBM). Specifically, sharp information theoretic limits and efficient algorithms have been obtained for SBMs as a function of $p$ and $q$, which represent the intra-community and inter-community connection probabilities. In this paper, we study the community detection problem while preserving the privacy of the individual connections between the vertices. Focusing on the notion of $(\\epsilon, \\delta)$-edge differential privacy (DP), we seek to understand the fundamental tradeoffs between $(p, q)$, DP budget $(\\epsilon, \\delta)$, and computational efficiency for exact recovery of community labels. To this end, we present and analyze the associated information-theoretic tradeoffs for three differentially private community recovery mechanisms: a) stability based mechanism; b) sampling based mechanisms; and c) graph perturbation mechanisms. Our main findings are that stability and sampling based mechanisms lead to a superior tradeoff between $(p,q)$ and the privacy budget $(\\epsilon, \\delta)$; however this comes at the expense of higher computational complexity. 
On the other hand, albeit low complexity, graph perturbation mechanisms require the privacy budget $\\epsilon$ to scale as $\\Omega(\\log(n))$ for exact recovery.}\n}", "pdf": "https://proceedings.mlr.press/v162/mohamed22a/mohamed22a.pdf", "supp": "", "pdf_size": 1031938, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3895925412264029816&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, University of Arizona + Biocomplexity Institute and Initiative, University of Virginia + Department of Computer Science, University of Virginia; Biocomplexity Institute and Initiative, University of Virginia + Department of Computer Science, University of Virginia; Biocomplexity Institute and Initiative, University of Virginia + Department of Computer Science, University of Virginia; Department of Electrical and Computer Engineering, University of Arizona", "aff_domain": "email.arizona.edu;virginia.edu; ;email.arizona.edu", "email": "email.arizona.edu;virginia.edu; ;email.arizona.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mohamed22a.html", "aff_unique_index": "0+1+1;1+1;1+1;0", "aff_unique_norm": "University of Arizona;University of Virginia", "aff_unique_dep": "Department of Electrical and Computer Engineering;Biocomplexity Institute and Initiative", "aff_unique_url": "https://www.arizona.edu;https://www.virginia.edu", "aff_unique_abbr": "UArizona;UVA", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Coordinate Descent for Composite Empirical Risk Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18089", "id": "18089", "proceeding": "https://proceedings.mlr.press/v162/mangold22a.html", "poster": "/media/PosterPDFs/ICML%202022/a1d7311f2a312426d710e1c617fcbc8c.png?t=1657905249.2422006", "slides": "", "author_site": "Paul Mangold, Aur\u00e9lien Bellet, Joseph Salmon, Marc Tommasi", "author": "Paul Mangold; Aur\u00e9lien Bellet; Joseph Salmon; Marc Tommasi", "abstract": "Machine learning models can leak information about the data used to train them. To mitigate this issue, Differentially Private (DP) variants of optimization algorithms like Stochastic Gradient Descent (DP-SGD) have been designed to trade-off utility for privacy in Empirical Risk Minimization (ERM) problems. In this paper, we propose Differentially Private proximal Coordinate Descent (DP-CD), a new method to solve composite DP-ERM problems. We derive utility guarantees through a novel theoretical analysis of inexact coordinate descent. Our results show that, thanks to larger step sizes, DP-CD can exploit imbalance in gradient coordinates to outperform DP-SGD. We also prove new lower bounds for composite DP-ERM under coordinate-wise regularity assumptions, that are nearly matched by DP-CD. For practical implementations, we propose to clip gradients using coordinate-wise thresholds that emerge from our theory, avoiding costly hyperparameter tuning. 
Experiments on real and synthetic data support our results, and show that DP-CD compares favorably with DP-SGD.", "bibtex": "@InProceedings{pmlr-v162-mangold22a,\n title = \t {Differentially Private Coordinate Descent for Composite Empirical Risk Minimization},\n author = {Mangold, Paul and Bellet, Aur{\\'e}lien and Salmon, Joseph and Tommasi, Marc},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14948--14978},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mangold22a/mangold22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mangold22a.html},\n abstract = \t {Machine learning models can leak information about the data used to train them. To mitigate this issue, Differentially Private (DP) variants of optimization algorithms like Stochastic Gradient Descent (DP-SGD) have been designed to trade-off utility for privacy in Empirical Risk Minimization (ERM) problems. In this paper, we propose Differentially Private proximal Coordinate Descent (DP-CD), a new method to solve composite DP-ERM problems. We derive utility guarantees through a novel theoretical analysis of inexact coordinate descent. Our results show that, thanks to larger step sizes, DP-CD can exploit imbalance in gradient coordinates to outperform DP-SGD. We also prove new lower bounds for composite DP-ERM under coordinate-wise regularity assumptions, that are nearly matched by DP-CD. For practical implementations, we propose to clip gradients using coordinate-wise thresholds that emerge from our theory, avoiding costly hyperparameter tuning. Experiments on real and synthetic data support our results, and show that DP-CD compares favorably with DP-SGD.}\n}", "pdf": "https://proceedings.mlr.press/v162/mangold22a/mangold22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/mangold22a-supp.zip", "pdf_size": 593138, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1987671907632467081&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 23, "aff": "Univ. Lille, Inria, CNRS, Centrale Lille, UMR 9189 - CRIStAL, F-59000 Lille, France; Univ. Lille, Inria, CNRS, Centrale Lille, UMR 9189 - CRIStAL, F-59000 Lille, France; IMAG, Univ Montpellier, CNRS, Montpellier, France + Institut Universitaire de France (IUF); Univ. Lille, CNRS, Inria, Centrale Lille, UMR 9189 - CRIStAL, F-59000 Lille, France", "aff_domain": "inria.fr; ; ;", "email": "inria.fr; ; ;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mangold22a.html", "aff_unique_index": "0;0;1+2;0", "aff_unique_norm": "University of Lille;University of Montpellier;Institut Universitaire de France", "aff_unique_dep": "UMR 9189 - CRIStAL;IMAG;", "aff_unique_url": "https://www.univ-lille.fr;https://www.univ-montp1.fr;https://www.iuf.cnrs.fr", "aff_unique_abbr": "Univ. 
Lille;Univ Montpellier;IUF", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Lille;Montpellier;", "aff_country_unique_index": "0;0;0+0;0", "aff_country_unique": "France" }, { "title": "Differentially Private Maximal Information Coefficients", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16619", "id": "16619", "proceeding": "https://proceedings.mlr.press/v162/lazarsfeld22a.html", "poster": "/media/PosterPDFs/ICML%202022/9570efef719d705326f0ff817ef084e6.png?t=1658070606.4434206", "slides": "", "author_site": "John Lazarsfeld, Aaron Johnson, Emmanuel Adeniran", "author": "John Lazarsfeld; Aaron Johnson; Emmanuel Adeniran", "abstract": "The Maximal Information Coefficient (MIC) is a powerful statistic to identify dependencies between variables. However, it may be applied to sensitive data, and publishing it could leak private information. As a solution, we present algorithms to approximate MIC in a way that provides differential privacy. We show that the natural application of the classic Laplace mechanism yields insufficient accuracy. We therefore introduce the MICr statistic, which is a new MIC approximation that is more compatible with differential privacy. We prove MICr is a consistent estimator for MIC, and we provide two differentially private versions of it. We perform experiments on a variety of real and synthetic datasets. The results show that the private MICr statistics significantly outperform direct application of the Laplace mechanism. Moreover, experiments on real-world datasets show accuracy that is usable when the sample size is at least moderately large.", "bibtex": "@InProceedings{pmlr-v162-lazarsfeld22a,\n title = \t {Differentially Private Maximal Information Coefficients},\n author = {Lazarsfeld, John and Johnson, Aaron and Adeniran, Emmanuel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12126--12163},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lazarsfeld22a/lazarsfeld22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lazarsfeld22a.html},\n abstract = \t {The Maximal Information Coefficient (MIC) is a powerful statistic to identify dependencies between variables. However, it may be applied to sensitive data, and publishing it could leak private information. As a solution, we present algorithms to approximate MIC in a way that provides differential privacy. We show that the natural application of the classic Laplace mechanism yields insufficient accuracy. We therefore introduce the MICr statistic, which is a new MIC approximation that is more compatible with differential privacy. We prove MICr is a consistent estimator for MIC, and we provide two differentially private versions of it. We perform experiments on a variety of real and synthetic datasets. The results show that the private MICr statistics significantly outperform direct application of the Laplace mechanism. 
Moreover, experiments on real-world datasets show accuracy that is usable when the sample size is at least moderately large.}\n}", "pdf": "https://proceedings.mlr.press/v162/lazarsfeld22a/lazarsfeld22a.pdf", "supp": "", "pdf_size": 1513595, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14074773669133605205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Yale University; U.S. Naval Research Laboratory; Department of Computer Science, Yale University", "aff_domain": "yale.edu; ; ", "email": "yale.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lazarsfeld22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Yale University;Naval Research Laboratory", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.yale.edu;https://www.nrl.navy.mil", "aff_unique_abbr": "Yale;NRL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Diffusion Models for Adversarial Purification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16707", "id": "16707", "proceeding": "https://proceedings.mlr.press/v162/nie22a.html", "poster": "/media/PosterPDFs/ICML%202022/e32c51ad39723ee92b285b362c916ca7.png?t=1657645745.1428213", "slides": "/media/icml-2022/Slides/16707.pdf", "author_site": "Weili Nie, Brandon Guo, Yujia Huang, Chaowei Xiao, Arash Vahdat, Animashree Anandkumar", "author": "Weili Nie; Brandon Guo; Yujia Huang; Chaowei Xiao; Arash Vahdat; Animashree Anandkumar", "abstract": "Adversarial purification refers to a class of defense methods that remove adversarial perturbations using a generative model. These methods do not make assumptions on the form of attack and the classification model, and thus can defend pre-existing classifiers against unseen threats. However, their performance currently falls behind adversarial training methods. In this work, we propose DiffPure that uses diffusion models for adversarial purification: Given an adversarial example, we first diffuse it with a small amount of noise following a forward diffusion process, and then recover the clean image through a reverse generative process. To evaluate our method against strong adaptive attacks in an efficient and scalable way, we propose to use the adjoint method to compute full gradients of the reverse generative process. 
Extensive experiments on three image datasets including CIFAR-10, ImageNet and CelebA-HQ with three classifier architectures including ResNet, WideResNet and ViT demonstrate that our method achieves the state-of-the-art results, outperforming current adversarial training and adversarial purification methods, often by a large margin.", "bibtex": "@InProceedings{pmlr-v162-nie22a,\n title = \t {Diffusion Models for Adversarial Purification},\n author = {Nie, Weili and Guo, Brandon and Huang, Yujia and Xiao, Chaowei and Vahdat, Arash and Anandkumar, Animashree},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16805--16827},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nie22a/nie22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nie22a.html},\n abstract = \t {Adversarial purification refers to a class of defense methods that remove adversarial perturbations using a generative model. These methods do not make assumptions on the form of attack and the classification model, and thus can defend pre-existing classifiers against unseen threats. However, their performance currently falls behind adversarial training methods. In this work, we propose DiffPure that uses diffusion models for adversarial purification: Given an adversarial example, we first diffuse it with a small amount of noise following a forward diffusion process, and then recover the clean image through a reverse generative process. To evaluate our method against strong adaptive attacks in an efficient and scalable way, we propose to use the adjoint method to compute full gradients of the reverse generative process. 
Extensive experiments on three image datasets including CIFAR-10, ImageNet and CelebA-HQ with three classifier architectures including ResNet, WideResNet and ViT demonstrate that our method achieves the state-of-the-art results, outperforming current adversarial training and adversarial purification methods, often by a large margin.}\n}", "pdf": "https://proceedings.mlr.press/v162/nie22a/nie22a.pdf", "supp": "", "pdf_size": 12692196, "gs_citation": 583, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9166244005732160404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "NVIDIA; Caltech; Caltech; NVIDIA+ASU; NVIDIA; Caltech", "aff_domain": "nvidia.com; ; ; ;nvidia.com; ", "email": "nvidia.com; ; ; ;nvidia.com; ", "github": "", "project": "https://diffpure.github.io", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/nie22a.html", "aff_unique_index": "0;1;1;0+2;0;1", "aff_unique_norm": "NVIDIA;California Institute of Technology;Arizona State University", "aff_unique_dep": "NVIDIA Corporation;;", "aff_unique_url": "https://www.nvidia.com;https://www.caltech.edu;https://www.asu.edu", "aff_unique_abbr": "NVIDIA;Caltech;ASU", "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Diffusion bridges vector quantized variational autoencoders", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17283", "id": "17283", "proceeding": "https://proceedings.mlr.press/v162/cohen22b.html", "poster": "/media/PosterPDFs/ICML%202022/49e863b146f3b5470ee222ee84669b1c_g74WfPp.png?t=1657822154.4382062", "slides": "", "author_site": "Max Cohen, Guillaume QUISPE, Sylvain Le Corff, Charles Ollion, Eric Moulines", "author": "Max Cohen; Guillaume Quispe; Sylvain Le Corff; Charles Ollion; Eric Moulines", "abstract": "Vector Quantized-Variational AutoEncoders (VQ-VAE) are generative models based on discrete latent representations of the data, where inputs are mapped to a finite set of learned embeddings. To generate new samples, an autoregressive prior distribution over the discrete states must be trained separately. This prior is generally very complex and leads to slow generation. In this work, we propose a new model to train the prior and the encoder/decoder networks simultaneously. We build a diffusion bridge between a continuous coded vector and a non-informative prior distribution. The latent discrete states are then given as random functions of these continuous vectors. We show that our model is competitive with the autoregressive prior on the mini-Imagenet and CIFAR dataset and is efficient in both optimization and sampling. 
Our framework also extends the standard VQ-VAE and enables end-to-end training.", "bibtex": "@InProceedings{pmlr-v162-cohen22b,\n title = \t {Diffusion bridges vector quantized variational autoencoders},\n author = {Cohen, Max and Quispe, Guillaume and Corff, Sylvain Le and Ollion, Charles and Moulines, Eric},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4141--4156},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cohen22b/cohen22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/cohen22b.html},\n abstract = \t {Vector Quantized-Variational AutoEncoders (VQ-VAE) are generative models based on discrete latent representations of the data, where inputs are mapped to a finite set of learned embeddings. To generate new samples, an autoregressive prior distribution over the discrete states must be trained separately. This prior is generally very complex and leads to slow generation. In this work, we propose a new model to train the prior and the encoder/decoder networks simultaneously. We build a diffusion bridge between a continuous coded vector and a non-informative prior distribution. The latent discrete states are then given as random functions of these continuous vectors. We show that our model is competitive with the autoregressive prior on the mini-Imagenet and CIFAR dataset and is efficient in both optimization and sampling. Our framework also extends the standard VQ-VAE and enables end-to-end training.}\n}", "pdf": "https://proceedings.mlr.press/v162/cohen22b/cohen22b.pdf", "supp": "", "pdf_size": 10816200, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15768272528622480760&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Samovar, T\u00e9l\u00e9com SudParis, D\u00e9partement CITI, Institut Polytechnique de Paris, Palaiseau, France+Oze \u00c9nergies, Charenton-Le-Pont, France; Centre de Math\u00e9matiques Appliqu\u00e9es, \u00c9cole polytechnique, Institut Polytechnique de Paris, Palaiseau, France; Samovar, T\u00e9l\u00e9com SudParis, D\u00e9partement CITI, Institut Polytechnique de Paris, Palaiseau, France; Centre de Math\u00e9matiques Appliqu\u00e9es, \u00c9cole polytechnique, Institut Polytechnique de Paris, Palaiseau, France; Centre de Math\u00e9matiques Appliqu\u00e9es, \u00c9cole polytechnique, Institut Polytechnique de Paris, Palaiseau, France", "aff_domain": "telecom-sudparis.eu; ; ; ; ", "email": "telecom-sudparis.eu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/cohen22b.html", "aff_unique_index": "0+1;2;0;2;2", "aff_unique_norm": "T\u00e9l\u00e9com SudParis;Oze Energies;Ecole Polytechnique", "aff_unique_dep": "D\u00e9partement CITI;;Centre de Mathematiques Appliquees", "aff_unique_url": "https://www.telecom-sudparis.eu;;https://www.polytechnique.edu", "aff_unique_abbr": "TSP;;Ecole polytechnique", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Palaiseau;", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Dimension-free Complexity Bounds for High-order Nonconvex Finite-sum Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17483", 
"id": "17483", "proceeding": "https://proceedings.mlr.press/v162/zhou22a.html", "poster": "/media/PosterPDFs/ICML%202022/fc03d48253286a798f5116ec00e99b2b.png?t=1658050777.289536", "slides": "", "author_site": "Dongruo Zhou, Quanquan Gu", "author": "Dongruo Zhou; Quanquan Gu", "abstract": "Stochastic high-order methods for finding first-order stationary points in nonconvex finite-sum optimization have witnessed increasing interest in recent years, and various upper and lower bounds of the oracle complexity have been proved. However, under standard regularity assumptions, existing complexity bounds are all dimension-dependent (e.g., polylogarithmic dependence), which contrasts with the dimension-free complexity bounds for stochastic first-order methods and deterministic high-order methods. In this paper, we show that the polylogarithmic dimension dependence gap is not essential and can be closed. More specifically, we propose stochastic high-order algorithms with novel first-order and high-order derivative estimators, which can achieve dimension-free complexity bounds. With the access to $p$-th order derivatives of the objective function, we prove that our algorithm finds $\\epsilon$-stationary points with $O(n^{(2p-1)/(2p)}/\\epsilon^{(p+1)/p})$ high-order oracle complexities, where $n$ is the number of individual functions. Our result strictly improves the complexity bounds of existing high-order deterministic methods with respect to the dependence on $n$, and it is dimension-free compared with existing stochastic high-order methods.", "bibtex": "@InProceedings{pmlr-v162-zhou22a,\n title = \t {Dimension-free Complexity Bounds for High-order Nonconvex Finite-sum Optimization},\n author = {Zhou, Dongruo and Gu, Quanquan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27143--27158},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22a/zhou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22a.html},\n abstract = \t {Stochastic high-order methods for finding first-order stationary points in nonconvex finite-sum optimization have witnessed increasing interest in recent years, and various upper and lower bounds of the oracle complexity have been proved. However, under standard regularity assumptions, existing complexity bounds are all dimension-dependent (e.g., polylogarithmic dependence), which contrasts with the dimension-free complexity bounds for stochastic first-order methods and deterministic high-order methods. In this paper, we show that the polylogarithmic dimension dependence gap is not essential and can be closed. More specifically, we propose stochastic high-order algorithms with novel first-order and high-order derivative estimators, which can achieve dimension-free complexity bounds. With the access to $p$-th order derivatives of the objective function, we prove that our algorithm finds $\\epsilon$-stationary points with $O(n^{(2p-1)/(2p)}/\\epsilon^{(p+1)/p})$ high-order oracle complexities, where $n$ is the number of individual functions. 
Our result strictly improves the complexity bounds of existing high-order deterministic methods with respect to the dependence on $n$, and it is dimension-free compared with existing stochastic high-order methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22a/zhou22a.pdf", "supp": "", "pdf_size": 398115, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16042782334862589289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA", "aff_domain": "cs.ucla.edu;cs.ucla.edu", "email": "cs.ucla.edu;cs.ucla.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhou22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Direct Behavior Specification via Constrained Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16263", "id": "16263", "proceeding": "https://proceedings.mlr.press/v162/roy22a.html", "poster": "/media/PosterPDFs/ICML%202022/894a9b94bcc5969b60bd18e8ea9c0ddc_xGtDFfL.png?t=1657229562.2874458", "slides": "", "author_site": "Julien Roy, Roger Girgis, Joshua Romoff, Pierre-Luc Bacon, Christopher Pal", "author": "Julien Roy; Roger Girgis; Joshua Romoff; Pierre-Luc Bacon; Chris J Pal", "abstract": "The standard formulation of Reinforcement Learning lacks a practical way of specifying what are admissible and forbidden behaviors. Most often, practitioners go about the task of behavior specification by manually engineering the reward function, a counter-intuitive process that requires several iterations and is prone to reward hacking by the agent. In this work, we argue that constrained RL, which has almost exclusively been used for safe RL, also has the potential to significantly reduce the amount of work spent for reward specification in applied RL projects. To this end, we propose to specify behavioral preferences in the CMDP framework and to use Lagrangian methods to automatically weigh each of these behavioral constraints. Specifically, we investigate how CMDPs can be adapted to solve goal-based tasks while adhering to several constraints simultaneously. 
We evaluate this framework on a set of continuous control tasks relevant to the application of Reinforcement Learning for NPC design in video games.", "bibtex": "@InProceedings{pmlr-v162-roy22a,\n title = \t {Direct Behavior Specification via Constrained Reinforcement Learning},\n author = {Roy, Julien and Girgis, Roger and Romoff, Joshua and Bacon, Pierre-Luc and Pal, Chris J},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18828--18843},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/roy22a/roy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/roy22a.html},\n abstract = \t {The standard formulation of Reinforcement Learning lacks a practical way of specifying what are admissible and forbidden behaviors. Most often, practitioners go about the task of behavior specification by manually engineering the reward function, a counter-intuitive process that requires several iterations and is prone to reward hacking by the agent. In this work, we argue that constrained RL, which has almost exclusively been used for safe RL, also has the potential to significantly reduce the amount of work spent for reward specification in applied RL projects. To this end, we propose to specify behavioral preferences in the CMDP framework and to use Lagrangian methods to automatically weigh each of these behavioral constraints. Specifically, we investigate how CMDPs can be adapted to solve goal-based tasks while adhering to several constraints simultaneously. 
We evaluate this framework on a set of continuous control tasks relevant to the application of Reinforcement Learning for NPC design in video games.}\n}", "pdf": "https://proceedings.mlr.press/v162/roy22a/roy22a.pdf", "supp": "", "pdf_size": 8637287, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12930072295285422644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Institut d\u2019intelligence artificielle du Qu\u00e9bec (Mila)+\u00c9cole Polytechnique de Montr\u00e9al+Ubisoft La Forge; Institut d\u2019intelligence artificielle du Qu\u00e9bec (Mila)+\u00c9cole Polytechnique de Montr\u00e9al+Ubisoft La Forge; Ubisoft La Forge; Institut d\u2019intelligence artificielle du Qu\u00e9bec (Mila)+\u00c9cole Polytechnique de Montr\u00e9al+Universit\u00e9 de Montr\u00e9al+Facebook CIFAR AI Chair+ServiceNow+Canada CIFAR AI Chair; Institut d\u2019intelligence artificielle du Qu\u00e9bec (Mila)+\u00c9cole Polytechnique de Montr\u00e9al+Universit\u00e9 de Montr\u00e9al+Facebook CIFAR AI Chair+ServiceNow+Canada CIFAR AI Chair", "aff_domain": "mila.quebec; ; ; ; ", "email": "mila.quebec; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/roy22a.html", "aff_unique_index": "0+1+2;0+1+2;2;0+1+3+4+5+6;0+1+3+4+5+6", "aff_unique_norm": "Institut d'intelligence artificielle du Quebec;Ecole Polytechnique de Montr\u00e9al;Ubisoft;Universit\u00e9 de Montr\u00e9al;Meta;ServiceNow;Canadian Institute for Advanced Research", "aff_unique_dep": "Mila;;La Forge;;Facebook CIFAR AI;;AI Chair", "aff_unique_url": "https://mila.quebec;https://www.polymtl.ca;https://www.ubisoft.com;https://www.umontreal.ca;https://www.facebook.com;https://www.servicenow.com;https://www.cifar.ca", "aff_unique_abbr": "Mila;Polytechnique Montr\u00e9al;Ubisoft;UdeM;FB;ServiceNow;CIFAR", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Montr\u00e9al", "aff_country_unique_index": "0+0+1;0+0+1;1;0+0+0+2+2+0;0+0+0+2+2+0", "aff_country_unique": "Canada;France;United States" }, { "title": "Directed Acyclic Transformer for Non-Autoregressive Machine Translation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17957", "id": "17957", "proceeding": "https://proceedings.mlr.press/v162/huang22m.html", "poster": "/media/PosterPDFs/ICML%202022/a6b964c0bb675116a15ef1325b01ff45.png?t=1656759752.5720599", "slides": "/media/icml-2022/Slides/17957.pdf", "author_site": "Fei Huang, Hao Zhou, Yang Liu, Hang Li, Minlie Huang", "author": "Fei Huang; Hao Zhou; Yang Liu; Hang Li; Minlie Huang", "abstract": "Non-autoregressive Transformers (NATs) significantly reduce the decoding latency by generating all tokens in parallel. However, such independent predictions prevent NATs from capturing the dependencies between the tokens for generating multiple possible translations. In this paper, we propose Directed Acyclic Transformer (DA-Transformer), which represents the hidden states in a Directed Acyclic Graph (DAG), where each path of the DAG corresponds to a specific translation. The whole DAG simultaneously captures multiple translations and facilitates fast predictions in a non-autoregressive fashion. 
Experiments on the raw training data of WMT benchmark show that DA-Transformer substantially outperforms previous NATs by about 3 BLEU on average, which is the first NAT model that achieves competitive results with autoregressive Transformers without relying on knowledge distillation.", "bibtex": "@InProceedings{pmlr-v162-huang22m,\n title = \t {Directed Acyclic Transformer for Non-Autoregressive Machine Translation},\n author = {Huang, Fei and Zhou, Hao and Liu, Yang and Li, Hang and Huang, Minlie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9410--9428},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22m/huang22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22m.html},\n abstract = \t {Non-autoregressive Transformers (NATs) significantly reduce the decoding latency by generating all tokens in parallel. However, such independent predictions prevent NATs from capturing the dependencies between the tokens for generating multiple possible translations. In this paper, we propose Directed Acyclic Transfomer (DA-Transformer), which represents the hidden states in a Directed Acyclic Graph (DAG), where each path of the DAG corresponds to a specific translation. The whole DAG simultaneously captures multiple translations and facilitates fast predictions in a non-autoregressive fashion. Experiments on the raw training data of WMT benchmark show that DA-Transformer substantially outperforms previous NATs by about 3 BLEU on average, which is the first NAT model that achieves competitive results with autoregressive Transformers without relying on knowledge distillation.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22m/huang22m.pdf", "supp": "", "pdf_size": 1430368, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12752123369496105828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "The CoAI group, Tsinghua University, China+Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Department of Computer Science and Technology, Tsinghua University, China; ByteDance AI Lab; Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Department of Computer Science and Technology, Tsinghua University, China; ByteDance AI Lab; The CoAI group, Tsinghua University, China+Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Department of Computer Science and Technology, Tsinghua University, China", "aff_domain": "mails.tsinghua.edu.cn;gmail.com;tsinghua.edu.cn;bytedance.com;tsinghua.edu.cn", "email": "mails.tsinghua.edu.cn;gmail.com;tsinghua.edu.cn;bytedance.com;tsinghua.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/huang22m.html", "aff_unique_index": "0+0;1;0;1;0+0", "aff_unique_norm": "Tsinghua University;ByteDance", "aff_unique_dep": "The CoAI group;AI Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com", 
"aff_unique_abbr": ";ByteDance", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "DisPFL: Towards Communication-Efficient Personalized Federated Learning via Decentralized Sparse Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17221", "id": "17221", "proceeding": "https://proceedings.mlr.press/v162/dai22b.html", "poster": "/media/PosterPDFs/ICML%202022/99064ba6631e279d4a74622df99657d6.png?t=1657608326.4175107", "slides": "", "author_site": "Rong Dai, Li Shen, Fengxiang He, Xinmei Tian, Dacheng Tao", "author": "Rong Dai; Li Shen; Fengxiang He; Xinmei Tian; Dacheng Tao", "abstract": "Personalized federated learning is proposed to handle the data heterogeneity problem amongst clients by learning dedicated tailored local models for each user. However, existing works are often built in a centralized way, leading to high communication pressure and high vulnerability when a failure or an attack on the central server occurs. In this work, we propose a novel personalized federated learning framework in a decentralized (peer-to-peer) communication protocol named DisPFL, which employs personalized sparse masks to customize sparse local models on the edge. To further save the communication and computation cost, we propose a decentralized sparse training technique, which means that each local model in DisPFL only maintains a fixed number of active parameters throughout the whole local training and peer-to-peer communication process. Comprehensive experiments demonstrate that DisPFL significantly saves the communication bottleneck for the busiest node among all clients and, at the same time, achieves higher model accuracy with less computation cost and communication rounds. Furthermore, we demonstrate that our method can easily adapt to heterogeneous local clients with varying computation complexities and achieves better personalized performances.", "bibtex": "@InProceedings{pmlr-v162-dai22b,\n title = \t {{D}is{PFL}: Towards Communication-Efficient Personalized Federated Learning via Decentralized Sparse Training},\n author = {Dai, Rong and Shen, Li and He, Fengxiang and Tian, Xinmei and Tao, Dacheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4587--4604},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dai22b/dai22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/dai22b.html},\n abstract = \t {Personalized federated learning is proposed to handle the data heterogeneity problem amongst clients by learning dedicated tailored local models for each user. However, existing works are often built in a centralized way, leading to high communication pressure and high vulnerability when a failure or an attack on the central server occurs. In this work, we propose a novel personalized federated learning framework in a decentralized (peer-to-peer) communication protocol named DisPFL, which employs personalized sparse masks to customize sparse local models on the edge. 
To further save the communication and computation cost, we propose a decentralized sparse training technique, which means that each local model in DisPFL only maintains a fixed number of active parameters throughout the whole local training and peer-to-peer communication process. Comprehensive experiments demonstrate that DisPFL significantly saves the communication bottleneck for the busiest node among all clients and, at the same time, achieves higher model accuracy with less computation cost and communication rounds. Furthermore, we demonstrate that our method can easily adapt to heterogeneous local clients with varying computation complexities and achieves better personalized performances.}\n}", "pdf": "https://proceedings.mlr.press/v162/dai22b/dai22b.pdf", "supp": "", "pdf_size": 9978649, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13590903827423118545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Science and Technology of China, Hefei, China; JD Explore Academy, Beijing, China; JD Explore Academy, Beijing, China; University of Science and Technology of China, Hefei, China + Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China; JD Explore Academy, Beijing, China", "aff_domain": "ustc.edu.cn;gmail.com; ; ; ", "email": "ustc.edu.cn;gmail.com; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dai22b.html", "aff_unique_index": "0;1;1;0+2;1", "aff_unique_norm": "University of Science and Technology of China;JD;Hefei Comprehensive National Science Center", "aff_unique_dep": ";JD Explore Academy;Institute of Artificial Intelligence", "aff_unique_url": "http://www.ustc.edu.cn;;", "aff_unique_abbr": "USTC;;", "aff_campus_unique_index": "0;1;1;0+0;1", "aff_campus_unique": "Hefei;Beijing", "aff_country_unique_index": "0;0;0;0+0;0", "aff_country_unique": "China" }, { "title": "Discovering Generalizable Spatial Goal Representations via Graph-based Active Reward Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18241", "id": "18241", "proceeding": "https://proceedings.mlr.press/v162/netanyahu22a.html", "poster": "/media/PosterPDFs/ICML%202022/85ea6fd7a2ca3960d0cf5201933ac998_zQCRjXw.png?t=1657831408.1595387", "slides": "", "author_site": "Aviv Netanyahu, Tianmin Shu, Josh Tenenbaum, Pulkit Agrawal", "author": "Aviv Netanyahu; Tianmin Shu; Joshua Tenenbaum; Pulkit Agrawal", "abstract": "In this work, we consider one-shot imitation learning for object rearrangement tasks, where an AI agent needs to watch a single expert demonstration and learn to perform the same task in different environments. To achieve a strong generalization, the AI agent must infer the spatial goal specification for the task. However, there can be multiple goal specifications that fit the given demonstration. To address this, we propose a reward learning approach, Graph-based Equivalence Mappings (GEM), that can discover spatial goal representations that are aligned with the intended goal specification, enabling successful generalization in unseen environments. Specifically, GEM represents a spatial goal specification by a reward function conditioned on i) a graph indicating important spatial relationships between objects and ii) state equivalence mappings for each edge in the graph indicating invariant properties of the corresponding relationship. 
GEM combines inverse reinforcement learning and active reward learning to efficiently improve the reward function by utilizing the graph structure and domain randomization enabled by the equivalence mappings. We conducted experiments with simulated oracles and with human subjects. The results show that GEM can drastically improve the generalizability of the learned goal representations over strong baselines.", "bibtex": "@InProceedings{pmlr-v162-netanyahu22a,\n title = \t {Discovering Generalizable Spatial Goal Representations via Graph-based Active Reward Learning},\n author = {Netanyahu, Aviv and Shu, Tianmin and Tenenbaum, Joshua and Agrawal, Pulkit},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16480--16495},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/netanyahu22a/netanyahu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/netanyahu22a.html},\n abstract = \t {In this work, we consider one-shot imitation learning for object rearrangement tasks, where an AI agent needs to watch a single expert demonstration and learn to perform the same task in different environments. To achieve a strong generalization, the AI agent must infer the spatial goal specification for the task. However, there can be multiple goal specifications that fit the given demonstration. To address this, we propose a reward learning approach, Graph-based Equivalence Mappings (GEM), that can discover spatial goal representations that are aligned with the intended goal specification, enabling successful generalization in unseen environments. Specifically, GEM represents a spatial goal specification by a reward function conditioned on i) a graph indicating important spatial relationships between objects and ii) state equivalence mappings for each edge in the graph indicating invariant properties of the corresponding relationship. GEM combines inverse reinforcement learning and active reward learning to efficiently improve the reward function by utilizing the graph structure and domain randomization enabled by the equivalence mappings. We conducted experiments with simulated oracles and with human subjects. The results show that GEM can drastically improve the generalizability of the learned goal representations over strong baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/netanyahu22a/netanyahu22a.pdf", "supp": "", "pdf_size": 1246710, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16264479961241855750&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Dept. of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA+Dept. of Brain and Cognitive Science, Massachusetts Institute of Technology, Cambridge, MA; Dept. of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA+Dept. of Brain and Cognitive Science, Massachusetts Institute of Technology, Cambridge, MA; Dept. of Brain and Cognitive Science, Massachusetts Institute of Technology, Cambridge, MA; Dept. 
of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA", "aff_domain": "mit.edu;mit.edu; ; ", "email": "mit.edu;mit.edu; ; ", "github": "", "project": "https://www.tshu.io/GEM", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/netanyahu22a.html", "aff_unique_index": "0+0;0+0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Dept. of Electrical Engineering and Computer Science", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Discrete Probabilistic Inverse Optimal Transport", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16637", "id": "16637", "proceeding": "https://proceedings.mlr.press/v162/chiu22b.html", "poster": "/media/PosterPDFs/ICML%202022/53e19f3dbb211f20b20b45668303c1b6.png?t=1657833920.6303103", "slides": "", "author_site": "Wei-Ting Chiu, Pei Wang, Patrick Shafto", "author": "Wei-Ting Chiu; Pei Wang; Patrick Shafto", "abstract": "Inverse Optimal Transport (IOT) studies the problem of inferring the underlying cost that gives rise to an observation on coupling two probability measures. Couplings appear as the outcome of matching sets (e.g. dating) and moving distributions (e.g. transportation). Compared to Optimal transport (OT), the mathematical theory of IOT is undeveloped. We formalize and systematically analyze the properties of IOT using tools from the study of entropy-regularized OT. Theoretical contributions include characterization of the manifold of cross-ratio equivalent costs, the implications of model priors, and derivation of an MCMC sampler. Empirical contributions include visualizations of cross-ratio equivalent effect on basic examples, simulations validating theoretical results and experiments on real world data.", "bibtex": "@InProceedings{pmlr-v162-chiu22b,\n title = \t {Discrete Probabilistic Inverse Optimal Transport},\n author = {Chiu, Wei-Ting and Wang, Pei and Shafto, Patrick},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3925--3946},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chiu22b/chiu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/chiu22b.html},\n abstract = \t {Inverse Optimal Transport (IOT) studies the problem of inferring the underlying cost that gives rise to an observation on coupling two probability measures. Couplings appear as the outcome of matching sets (e.g. dating) and moving distributions (e.g. transportation). Compared to Optimal transport (OT), the mathematical theory of IOT is undeveloped. We formalize and systematically analyze the properties of IOT using tools from the study of entropy-regularized OT. Theoretical contributions include characterization of the manifold of cross-ratio equivalent costs, the implications of model priors, and derivation of an MCMC sampler. 
Empirical contributions include visualizations of cross-ratio equivalent effect on basic examples, simulations validating theoretical results and experiments on real world data.}\n}", "pdf": "https://proceedings.mlr.press/v162/chiu22b/chiu22b.pdf", "supp": "", "pdf_size": 3225537, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9176904387862477319&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Mathematics and Computer Science, Rutgers University Newark, NJ + School of Mathematics, Institute for Advanced Study (IAS), Princeton NJ; Department of Mathematics and Computer Science, Rutgers University Newark, NJ + School of Mathematics, Institute for Advanced Study (IAS), Princeton NJ; Department of Mathematics and Computer Science, Rutgers University Newark, NJ + School of Mathematics, Institute for Advanced Study (IAS), Princeton NJ", "aff_domain": "rutgers.edu;gmail.com;gmail.com", "email": "rutgers.edu;gmail.com;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chiu22b.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Rutgers University;Institute for Advanced Study", "aff_unique_dep": "Department of Mathematics and Computer Science;School of Mathematics", "aff_unique_url": "https://www.rutgers.edu;https://www.ias.edu", "aff_unique_abbr": "Rutgers;IAS", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Newark;Princeton", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Discrete Tree Flows via Tree-Structured Permutations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16691", "id": "16691", "proceeding": "https://proceedings.mlr.press/v162/elkady22a.html", "poster": "/media/PosterPDFs/ICML%202022/f80bf05527157a8c2a7bb63b22f49aaa.png?t=1657997636.6093285", "slides": "", "author_site": "Mai Elkady, Jim Lim, David I. Inouye", "author": "Mai Elkady; Jim Lim; David I. Inouye", "abstract": "While normalizing flows for continuous data have been extensively researched, flows for discrete data have only recently been explored. These prior models, however, suffer from limitations that are distinct from those of continuous flows. Most notably, discrete flow-based models cannot be straightforwardly optimized with conventional deep learning methods because gradients of discrete functions are undefined or zero. Previous works approximate pseudo-gradients of the discrete functions but do not solve the problem on a fundamental level. In addition to that, backpropagation can be computationally burdensome compared to alternative discrete algorithms such as decision tree algorithms. Our approach seeks to reduce computational burden and remove the need for pseudo-gradients by developing a discrete flow based on decision trees\u2014building upon the success of efficient tree-based methods for classification and regression for discrete data. We first define a tree-structured permutation (TSP) that compactly encodes a permutation of discrete data where the inverse is easy to compute; thus, we can efficiently compute the density value and sample new data. We then propose a decision tree algorithm to build TSPs that learns the tree structure and permutations at each node via novel criteria. 
We empirically demonstrate the feasibility of our method on multiple datasets.", "bibtex": "@InProceedings{pmlr-v162-elkady22a,\n title = \t {Discrete Tree Flows via Tree-Structured Permutations},\n author = {Elkady, Mai and Lim, Jim and Inouye, David I.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5892--5923},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/elkady22a/elkady22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/elkady22a.html},\n abstract = \t {While normalizing flows for continuous data have been extensively researched, flows for discrete data have only recently been explored. These prior models, however, suffer from limitations that are distinct from those of continuous flows. Most notably, discrete flow-based models cannot be straightforwardly optimized with conventional deep learning methods because gradients of discrete functions are undefined or zero. Previous works approximate pseudo-gradients of the discrete functions but do not solve the problem on a fundamental level. In addition to that, backpropagation can be computationally burdensome compared to alternative discrete algorithms such as decision tree algorithms. Our approach seeks to reduce computational burden and remove the need for pseudo-gradients by developing a discrete flow based on decision trees\u2014building upon the success of efficient tree-based methods for classification and regression for discrete data. We first define a tree-structured permutation (TSP) that compactly encodes a permutation of discrete data where the inverse is easy to compute; thus, we can efficiently compute the density value and sample new data. We then propose a decision tree algorithm to build TSPs that learns the tree structure and permutations at each node via novel criteria. 
We empirically demonstrate the feasibility of our method on multiple datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/elkady22a/elkady22a.pdf", "supp": "", "pdf_size": 2592199, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16353286354064639558&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Purdue University; Department of Electrical and Computer Engineering, Purdue University; Department of Electrical and Computer Engineering, Purdue University", "aff_domain": "purdue.edu;purdue.edu; ", "email": "purdue.edu;purdue.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/elkady22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Discriminator-Weighted Offline Imitation Learning from Suboptimal Demonstrations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17685", "id": "17685", "proceeding": "https://proceedings.mlr.press/v162/xu22l.html", "poster": "", "slides": "/media/icml-2022/Slides/17685.pdf", "author_site": "Haoran Xu, Xianyuan Zhan, Honglei Yin, Huiling Qin", "author": "Haoran Xu; Xianyuan Zhan; Honglei Yin; Huiling Qin", "abstract": "We study the problem of offline Imitation Learning (IL) where an agent aims to learn an optimal expert behavior policy without additional online environment interactions. Instead, the agent is provided with a supplementary offline dataset from suboptimal behaviors. Prior works that address this problem either require that expert data occupies the majority proportion of the offline dataset, or need to learn a reward function and perform offline reinforcement learning (RL) afterwards. In this paper, we aim to address the problem without additional steps of reward learning and offline RL training for the case when demonstrations contain a large proportion of suboptimal data. Built upon behavioral cloning (BC), we introduce an additional discriminator to distinguish expert and non-expert data. We propose a cooperation framework to boost the learning of both tasks. Based on this framework, we design a new IL algorithm, where the outputs of the discriminator serve as the weights of the BC loss. 
Experimental results show that our proposed algorithm achieves higher returns and faster training speed compared to baseline algorithms.", "bibtex": "@InProceedings{pmlr-v162-xu22l,\n title = \t {Discriminator-Weighted Offline Imitation Learning from Suboptimal Demonstrations},\n author = {Xu, Haoran and Zhan, Xianyuan and Yin, Honglei and Qin, Huiling},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24725--24742},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22l/xu22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22l.html},\n abstract = \t {We study the problem of offline Imitation Learning (IL) where an agent aims to learn an optimal expert behavior policy without additional online environment interactions. Instead, the agent is provided with a supplementary offline dataset from suboptimal behaviors. Prior works that address this problem either require that expert data occupies the majority proportion of the offline dataset, or need to learn a reward function and perform offline reinforcement learning (RL) afterwards. In this paper, we aim to address the problem without additional steps of reward learning and offline RL training for the case when demonstrations contain a large proportion of suboptimal data. Built upon behavioral cloning (BC), we introduce an additional discriminator to distinguish expert and non-expert data. We propose a cooperation framework to boost the learning of both tasks, Based on this framework, we design a new IL algorithm, where the outputs of the discriminator serve as the weights of the BC loss. 
Experimental results show that our proposed algorithm achieves higher returns and faster training speed compared to baseline algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22l/xu22l.pdf", "supp": "", "pdf_size": 659169, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12184701455253705252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "JD Technology, Beijing, China; Institute for AI Industry Research (AIR), Tsinghua University, Beijing, China; JD Technology, Beijing, China; JD Technology, Beijing, China", "aff_domain": "gmail.com;air.tsinghua.edu.cn; ; ", "email": "gmail.com;air.tsinghua.edu.cn; ; ", "github": "https://github.com/ryanxhr/DWBC", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/xu22l.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "JD;Tsinghua University", "aff_unique_dep": "JD Technology;Institute for AI Industry Research (AIR)", "aff_unique_url": "https://www.jd.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "JD;Tsinghua", "aff_campus_unique_index": "1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Disentangled Federated Learning for Tackling Attributes Skew via Invariant Aggregation and Diversity Transferring", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16881", "id": "16881", "proceeding": "https://proceedings.mlr.press/v162/luo22b.html", "poster": "/media/PosterPDFs/ICML%202022/087408522c31eeb1f982bc0eaf81d35f.png?t=1655645140.9102407", "slides": "/media/icml-2022/Slides/16881.pdf", "author_site": "Zhengquan Luo, Yunlong Wang, Zilei Wang, Zhenan Sun, Tieniu Tan", "author": "Zhengquan Luo; Yunlong Wang; Zilei Wang; Zhenan Sun; Tieniu Tan", "abstract": "Attributes skew hinders the current federated learning (FL) frameworks from consistent optimization directions among the clients, which inevitably leads to performance reduction and unstable convergence. The core problems lie in that: 1) Domain-specific attributes, which are non-causal and only locally valid, are indeliberately mixed into global aggregation. 2) The one-stage optimizations of entangled attributes cannot simultaneously satisfy two conflicting objectives, i.e., generalization and personalization. To cope with these, we proposed disentangled federated learning (DFL) to disentangle the domain-specific and cross-invariant attributes into two complementary branches, which are trained by the proposed alternating local-global optimization independently. Importantly, convergence analysis proves that the FL system can be stably converged even if incomplete client models participate in the global aggregation, which greatly expands the application scope of FL. 
Extensive experiments verify that DFL facilitates FL with higher performance, better interpretability, and faster convergence rate, compared with SOTA FL methods on both manually synthesized and realistic attributes skew datasets.", "bibtex": "@InProceedings{pmlr-v162-luo22b,\n title = \t {Disentangled Federated Learning for Tackling Attributes Skew via Invariant Aggregation and Diversity Transferring},\n author = {Luo, Zhengquan and Wang, Yunlong and Wang, Zilei and Sun, Zhenan and Tan, Tieniu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14527--14541},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/luo22b/luo22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/luo22b.html},\n abstract = \t {Attributes skew hinders the current federated learning (FL) frameworks from consistent optimization directions among the clients, which inevitably leads to performance reduction and unstable convergence. The core problems lie in that: 1) Domain-specific attributes, which are non-causal and only locally valid, are indeliberately mixed into global aggregation. 2) The one-stage optimizations of entangled attributes cannot simultaneously satisfy two conflicting objectives, i.e., generalization and personalization. To cope with these, we proposed disentangled federated learning (DFL) to disentangle the domain-specific and cross-invariant attributes into two complementary branches, which are trained by the proposed alternating local-global optimization independently. Importantly, convergence analysis proves that the FL system can be stably converged even if incomplete client models participate in the global aggregation, which greatly expands the application scope of FL. 
Extensive experiments verify that DFL facilitates FL with higher performance, better interpretability, and faster convergence rate, compared with SOTA FL methods on both manually synthesized and realistic attributes skew datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/luo22b/luo22b.pdf", "supp": "", "pdf_size": 7398825, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11534090670840869688&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Science and Technology of China (USTC)+Institute of Automation, Chinese Academy of Sciences (CASIA); Institute of Automation, Chinese Academy of Sciences (CASIA); University of Science and Technology of China (USTC); Institute of Automation, Chinese Academy of Sciences (CASIA); Institute of Automation, Chinese Academy of Sciences (CASIA)", "aff_domain": "cripac.ia.ac.cn;cripac.ia.ac.cn;ustc.edu.cn;nlpr.ia.ac.cn;nlpr.ia.ac.cn", "email": "cripac.ia.ac.cn;cripac.ia.ac.cn;ustc.edu.cn;nlpr.ia.ac.cn;nlpr.ia.ac.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/luo22b.html", "aff_unique_index": "0+1;1;0;1;1", "aff_unique_norm": "University of Science and Technology of China;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Automation", "aff_unique_url": "http://www.ustc.edu.cn;http://www.ia.cas.cn", "aff_unique_abbr": "USTC;CASIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Disentangling Disease-related Representation from Obscure for Disease Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17211", "id": "17211", "proceeding": "https://proceedings.mlr.press/v162/wang22f.html", "poster": "/media/PosterPDFs/ICML%202022/4639475d6782a08c1e964f9a4329a254.png?t=1656665802.1628184", "slides": "", "author_site": "Chu-ran Wang, Fei Gao, Fandong Zhang, Fangwei Zhong, Yizhou Yu, Yizhou Wang", "author": "Chu-Ran Wang; Fei Gao; Fandong Zhang; Fangwei Zhong; Yizhou Yu; Yizhou Wang", "abstract": "Disease-related representations play a crucial role in image-based disease prediction such as cancer diagnosis, due to its considerable generalization capacity. However, it is still a challenge to identify lesion characteristics in obscured images, as many lesions are obscured by other tissues. In this paper, to learn the representations for identifying obscured lesions, we propose a disentanglement learning strategy under the guidance of alpha blending generation in an encoder-decoder framework (DAB-Net). Specifically, we take mammogram mass benign/malignant classification as an example. In our framework, composite obscured mass images are generated by alpha blending and then explicitly disentangled into disease-related mass features and interference glands features. To achieve disentanglement learning, features of these two parts are decoded to reconstruct the mass and the glands with corresponding reconstruction losses, and only disease-related mass features are fed into the classifier for disease prediction. Experimental results on one public dataset DDSM and three in-house datasets demonstrate that the proposed strategy can achieve state-of-the-art performance. DAB-Net achieves substantial improvements of 3.9%~4.4% AUC in obscured cases. 
Besides, the visualization analysis shows the model can better disentangle the mass and glands in the obscured image, suggesting the effectiveness of our solution in exploring the hidden characteristics in this challenging problem.", "bibtex": "@InProceedings{pmlr-v162-wang22f,\n title = \t {Disentangling Disease-related Representation from Obscure for Disease Prediction},\n author = {Wang, Chu-Ran and Gao, Fei and Zhang, Fandong and Zhong, Fangwei and Yu, Yizhou and Wang, Yizhou},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22652--22664},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22f/wang22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22f.html},\n abstract = \t {Disease-related representations play a crucial role in image-based disease prediction such as cancer diagnosis, due to its considerable generalization capacity. However, it is still a challenge to identify lesion characteristics in obscured images, as many lesions are obscured by other tissues. In this paper, to learn the representations for identifying obscured lesions, we propose a disentanglement learning strategy under the guidance of alpha blending generation in an encoder-decoder framework (DAB-Net). Specifically, we take mammogram mass benign/malignant classification as an example. In our framework, composite obscured mass images are generated by alpha blending and then explicitly disentangled into disease-related mass features and interference glands features. To achieve disentanglement learning, features of these two parts are decoded to reconstruct the mass and the glands with corresponding reconstruction losses, and only disease-related mass features are fed into the classifier for disease prediction. Experimental results on one public dataset DDSM and three in-house datasets demonstrate that the proposed strategy can achieve state-of-the-art performance. DAB-Net achieves substantial improvements of 3.9%~4.4% AUC in obscured cases. 
Besides, the visualization analysis shows the model can better disentangle the mass and glands in the obscured image, suggesting the effectiveness of our solution in exploring the hidden characteristics in this challenging problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22f/wang22f.pdf", "supp": "", "pdf_size": 3173891, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4394657962617872357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22f.html" }, { "title": "Disentangling Sources of Risk for Distributional Multi-Agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17455", "id": "17455", "proceeding": "https://proceedings.mlr.press/v162/son22a.html", "poster": "", "slides": "", "author_site": "Kyunghwan Son, Junsu Kim, Sungsoo Ahn, Roben Delos Reyes, Yung Yi, Jinwoo Shin", "author": "Kyunghwan Son; Junsu Kim; Sungsoo Ahn; Roben D Delos Reyes; Yung Yi; Jinwoo Shin", "abstract": "In cooperative multi-agent reinforcement learning, the outcomes of agent-wise policies are highly stochastic due to the two sources of risk: (a) random actions taken by teammates and (b) random transition and rewards. Although the two sources have very distinct characteristics, existing frameworks are insufficient to control the risk-sensitivity of agent-wise policies in a disentangled manner. To this end, we propose Disentangled RIsk-sensitive Multi-Agent reinforcement learning (DRIMA) to separately access the risk sources. For example, our framework allows an agent to be optimistic with respect to teammates (who can prosocially adapt) but more risk-neutral with respect to the environment (which does not adapt). Our experiments demonstrate that DRIMA significantly outperforms prior state-of-the-art methods across various scenarios in the StarCraft Multi-agent Challenge environment. Notably, DRIMA shows robust performance where prior methods learn only a highly suboptimal policy, regardless of reward shaping, exploration scheduling, and noisy (random or adversarial) agents.", "bibtex": "@InProceedings{pmlr-v162-son22a,\n title = \t {Disentangling Sources of Risk for Distributional Multi-Agent Reinforcement Learning},\n author = {Son, Kyunghwan and Kim, Junsu and Ahn, Sungsoo and Reyes, Roben D Delos and Yi, Yung and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20347--20368},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/son22a/son22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/son22a.html},\n abstract = \t {In cooperative multi-agent reinforcement learning, the outcomes of agent-wise policies are highly stochastic due to the two sources of risk: (a) random actions taken by teammates and (b) random transition and rewards. Although the two sources have very distinct characteristics, existing frameworks are insufficient to control the risk-sensitivity of agent-wise policies in a disentangled manner. 
To this end, we propose Disentangled RIsk-sensitive Multi-Agent reinforcement learning (DRIMA) to separately access the risk sources. For example, our framework allows an agent to be optimistic with respect to teammates (who can prosocially adapt) but more risk-neutral with respect to the environment (which does not adapt). Our experiments demonstrate that DRIMA significantly outperforms prior state-of-the-art methods across various scenarios in the StarCraft Multi-agent Challenge environment. Notably, DRIMA shows robust performance where prior methods learn only a highly suboptimal policy, regardless of reward shaping, exploration scheduling, and noisy (random or adversarial) agents.}\n}", "pdf": "https://proceedings.mlr.press/v162/son22a/son22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/son22a-supp.zip", "pdf_size": 10656263, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8857655750744467124&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Korea Advanced Institute of Science and Technology (KAIST); Korea Advanced Institute of Science and Technology (KAIST); Pohang University of Science and Technology (POSTECH); Korea Advanced Institute of Science and Technology (KAIST); Korea Advanced Institute of Science and Technology (KAIST); Korea Advanced Institute of Science and Technology (KAIST)", "aff_domain": "kaist.ac.kr; ; ; ; ; ", "email": "kaist.ac.kr; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/son22a.html", "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;Pohang University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.postech.ac.kr", "aff_unique_abbr": "KAIST;POSTECH", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pohang", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Distinguishing rule and exemplar-based generalization in learning systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16693", "id": "16693", "proceeding": "https://proceedings.mlr.press/v162/dasgupta22b.html", "poster": "/media/PosterPDFs/ICML%202022/59f51fd6937412b7e56ded1ea2470c25.png?t=1657830660.7668748", "slides": "", "author_site": "Ishita Dasgupta, Erin Grant, Thomas Griffiths", "author": "Ishita Dasgupta; Erin Grant; Tom Griffiths", "abstract": "Machine learning systems often do not share the same inductive biases as humans and, as a result, extrapolate or generalize in ways that are inconsistent with our expectations. The trade-off between exemplar- and rule-based generalization has been studied extensively in cognitive psychology; in this work, we present a protocol inspired by these experimental approaches to probe the inductive biases that control this trade-off in category-learning systems such as artificial neural networks. We isolate two such inductive biases: feature-level bias (differences in which features are more readily learned) and exemplar-vs-rule bias (differences in how these learned features are used for generalization of category labels). 
We find that standard neural network models are feature-biased and have a propensity towards exemplar-based extrapolation; we discuss the implications of these findings for machine-learning research on data augmentation, fairness, and systematic generalization.", "bibtex": "@InProceedings{pmlr-v162-dasgupta22b,\n title = \t {Distinguishing rule and exemplar-based generalization in learning systems},\n author = {Dasgupta, Ishita and Grant, Erin and Griffiths, Tom},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4816--4830},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dasgupta22b/dasgupta22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/dasgupta22b.html},\n abstract = \t {Machine learning systems often do not share the same inductive biases as humans and, as a result, extrapolate or generalize in ways that are inconsistent with our expectations. The trade-off between exemplar- and rule-based generalization has been studied extensively in cognitive psychology; in this work, we present a protocol inspired by these experimental approaches to probe the inductive biases that control this trade-off in category-learning systems such as artificial neural networks. We isolate two such inductive biases: feature-level bias (differences in which features are more readily learned) and exemplar-vs-rule bias (differences in how these learned features are used for generalization of category labels). We find that standard neural network models are feature-biased and have a propensity towards exemplar-based extrapolation; we discuss the implications of these findings for machine-learning research on data augmentation, fairness, and systematic generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/dasgupta22b/dasgupta22b.pdf", "supp": "", "pdf_size": 2433642, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8589429647517094065&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Departments of Psychology & Computer Science, Princeton University; Department of Electrical Engineering & Computer Sciences, UC Berkeley + DeepMind; Departments of Psychology & Computer Science, Princeton University", "aff_domain": "deepmind.com;berkeley.edu; ", "email": "deepmind.com;berkeley.edu; ", "github": "https://github.com/eringrant/icml-2022-rules-vs-exemplars", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dasgupta22b.html", "aff_unique_index": "0;1+2;0", "aff_unique_norm": "Princeton University;University of California, Berkeley;DeepMind", "aff_unique_dep": "Departments of Psychology & Computer Science;Department of Electrical Engineering & Computer Sciences;", "aff_unique_url": "https://www.princeton.edu;https://www.berkeley.edu;https://deepmind.com", "aff_unique_abbr": "Princeton;UC Berkeley;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0+1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Distribution Regression with Sliced Wasserstein Kernels", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17913", "id": "17913", "proceeding": "https://proceedings.mlr.press/v162/meunier22b.html", 
"poster": "/media/PosterPDFs/ICML%202022/afe434653a898da20044041262b3ac74.png?t=1658364742.1174154", "slides": "", "author_site": "Dimitri Marie Meunier, Massimiliano Pontil, Carlo Ciliberto", "author": "Dimitri Meunier; Massimiliano Pontil; Carlo Ciliberto", "abstract": "The problem of learning functions over spaces of probabilities - or distribution regression - is gaining significant interest in the machine learning community. The main challenge in these settings is to identify a suitable representation capturing all relevant properties of a distribution. The well-established approach in this sense is to use kernel mean embeddings, which lift kernel-induced similarity on the input domain at the probability level. This strategy effectively tackles the two-stage sampling nature of the problem, enabling one to derive estimators with strong statistical guarantees, such as universal consistency and excess risk bounds. However, kernel mean embeddings implicitly hinge on the maximum mean discrepancy (MMD), a metric on probabilities, which is not the most suited to capture geometrical relations between distributions. In contrast, optimal transport (OT) metrics, are potentially more appealing. In this work, we propose an OT-based estimator for distribution regression. We build on the Sliced Wasserstein distance to obtain an OT-based representation. We study the theoretical properties of a kernel ridge regression estimator based on such representation, for which we prove universal consistency and excess risk bounds. Preliminary experiments complement our theoretical findings by showing the effectiveness of the proposed approach and compare it with MMD-based estimators.", "bibtex": "@InProceedings{pmlr-v162-meunier22b,\n title = \t {Distribution Regression with Sliced {W}asserstein Kernels},\n author = {Meunier, Dimitri and Pontil, Massimiliano and Ciliberto, Carlo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15501--15523},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/meunier22b/meunier22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/meunier22b.html},\n abstract = \t {The problem of learning functions over spaces of probabilities - or distribution regression - is gaining significant interest in the machine learning community. The main challenge in these settings is to identify a suitable representation capturing all relevant properties of a distribution. The well-established approach in this sense is to use kernel mean embeddings, which lift kernel-induced similarity on the input domain at the probability level. This strategy effectively tackles the two-stage sampling nature of the problem, enabling one to derive estimators with strong statistical guarantees, such as universal consistency and excess risk bounds. However, kernel mean embeddings implicitly hinge on the maximum mean discrepancy (MMD), a metric on probabilities, which is not the most suited to capture geometrical relations between distributions. In contrast, optimal transport (OT) metrics, are potentially more appealing. In this work, we propose an OT-based estimator for distribution regression. We build on the Sliced Wasserstein distance to obtain an OT-based representation. 
We study the theoretical properties of a kernel ridge regression estimator based on such representation, for which we prove universal consistency and excess risk bounds. Preliminary experiments complement our theoretical findings by showing the effectiveness of the proposed approach and compare it with MMD-based estimators.}\n}", "pdf": "https://proceedings.mlr.press/v162/meunier22b/meunier22b.pdf", "supp": "", "pdf_size": 647620, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6056433376162861662&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Gatsby Computational Neuroscience Unit, University College London, London, United Kingdom; Italian Institute of Technology, Genoa, Italy + Department of Computer Science, University College London, London, United Kingdom; Department of Computer Science, University College London, London, United Kingdom", "aff_domain": "ucl.ac.uk; ; ", "email": "ucl.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/meunier22b.html", "aff_unique_index": "0;1+0;0", "aff_unique_norm": "University College London;Italian Institute of Technology", "aff_unique_dep": "Gatsby Computational Neuroscience Unit;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.iit.it", "aff_unique_abbr": "UCL;IIT", "aff_campus_unique_index": "0;1+0;0", "aff_campus_unique": "London;Genoa", "aff_country_unique_index": "0;1+0;0", "aff_country_unique": "United Kingdom;Italy" }, { "title": "Distributional Hamilton-Jacobi-Bellman Equations for Continuous-Time Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17075", "id": "17075", "proceeding": "https://proceedings.mlr.press/v162/wiltzer22a.html", "poster": "/media/PosterPDFs/ICML%202022/b52340b4de4566b804c9880aa0b4af5f.png?t=1657551406.300302", "slides": "", "author_site": "Harley Wiltzer, David Meger, Marc Bellemare", "author": "Harley E Wiltzer; David Meger; Marc G. Bellemare", "abstract": "Continuous-time reinforcement learning offers an appealing formalism for describing control problems in which the passage of time is not naturally divided into discrete increments. Here we consider the problem of predicting the distribution of returns obtained by an agent interacting in a continuous-time, stochastic environment. Accurate return predictions have proven useful for determining optimal policies for risk-sensitive control, learning state representations, multiagent coordination, and more. We begin by establishing the distributional analogue of the Hamilton-Jacobi-Bellman (HJB) equation for Ito diffusions and the broader class of Feller-Dynkin processes. We then specialize this equation to the setting in which the return distribution is approximated by N uniformly-weighted particles, a common design choice in distributional algorithms. Our derivation highlights additional terms due to statistical diffusivity which arise from the proper handling of distributions in the continuous-time setting. Based on this, we propose a tractable algorithm for approximately solving the distributional HJB based on a JKO scheme, which can be implemented in an online, control algorithm. 
We demonstrate the effectiveness of such an algorithm in a synthetic control problem.", "bibtex": "@InProceedings{pmlr-v162-wiltzer22a,\n title = \t {Distributional {H}amilton-Jacobi-{B}ellman Equations for Continuous-Time Reinforcement Learning},\n author = {Wiltzer, Harley E and Meger, David and Bellemare, Marc G.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23832--23856},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wiltzer22a/wiltzer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wiltzer22a.html},\n abstract = \t {Continuous-time reinforcement learning offers an appealing formalism for describing control problems in which the passage of time is not naturally divided into discrete increments. Here we consider the problem of predicting the distribution of returns obtained by an agent interacting in a continuous-time, stochastic environment. Accurate return predictions have proven useful for determining optimal policies for risk-sensitive control, learning state representations, multiagent coordination, and more. We begin by establishing the distributional analogue of the Hamilton-Jacobi-Bellman (HJB) equation for Ito diffusions and the broader class of Feller-Dynkin processes. We then specialize this equation to the setting in which the return distribution is approximated by N uniformly-weighted particles, a common design choice in distributional algorithms. Our derivation highlights additional terms due to statistical diffusivity which arise from the proper handling of distributions in the continuous-time setting. Based on this, we propose a tractable algorithm for approximately solving the distributional HJB based on a JKO scheme, which can be implemented in an online, control algorithm. 
We demonstrate the effectiveness of such an algorithm in a synthetic control problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/wiltzer22a/wiltzer22a.pdf", "supp": "", "pdf_size": 890453, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12270788519434759693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "McGill University, Montreal, Canada+Mila \u2013 Quebec AI Institute+Google Brain, Montreal, Canada+CIFAR Fellow; McGill University, Montreal, Canada+Mila \u2013 Quebec AI Institute; Mila \u2013 Quebec AI Institute+Google Brain, Montreal, Canada+CIFAR Fellow", "aff_domain": "mail.mcgill.ca; ; ", "email": "mail.mcgill.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wiltzer22a.html", "aff_unique_index": "0+1+2+3;0+1;1+2+3", "aff_unique_norm": "McGill University;Quebec AI Institute;Google;CIFAR", "aff_unique_dep": ";AI;Google Brain;", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec;https://brain.google.com;https://www.cifar.ca", "aff_unique_abbr": "McGill;Mila;Google Brain;CIFAR", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0+0+0+0;0+0;0+0+0", "aff_country_unique": "Canada" }, { "title": "Distributionally Robust $Q$-Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16889", "id": "16889", "proceeding": "https://proceedings.mlr.press/v162/liu22a.html", "poster": "/media/PosterPDFs/ICML%202022/cd0cbcc668fe4bc58e0af3cc7e0a653d.png?t=1657396305.5396066", "slides": "", "author_site": "Zijian Liu, Jerry Bai, Jose Blanchet, Perry Dong, Wei Xu, Zhengqing Zhou, Zhengyuan Zhou", "author": "Zijian Liu; Qinxun Bai; Jose Blanchet; Perry Dong; Wei Xu; Zhengqing Zhou; Zhengyuan Zhou", "abstract": "Reinforcement learning (RL) has demonstrated remarkable achievements in simulated environments. However, carrying this success to real environments requires the important attribute of robustness, which the existing RL algorithms often lack as they assume that the future deployment environment is the same as the training environment (i.e. simulator) in which the policy is learned. This assumption often does not hold due to the discrepancy between the simulator and the real environment and, as a result, renders the learned policy fragile when deployed. In this paper, we propose a novel distributionally robust $Q$-learning algorithm that learns the best policy in the worst distributional perturbation of the environment. Our algorithm first transforms the infinite-dimensional learning problem (since the environment MDP perturbation lies in an infinite-dimensional space) into a finite-dimensional dual problem and subsequently uses a multi-level Monte-Carlo scheme to approximate the dual value using samples from the simulator. Despite the complexity, we show that the resulting distributionally robust $Q$-learning algorithm asymptotically converges to optimal worst-case policy, thus making it robust to future environment changes. 
Simulation results further demonstrate its strong empirical robustness.", "bibtex": "@InProceedings{pmlr-v162-liu22a,\n title = \t {Distributionally Robust $Q$-Learning},\n author = {Liu, Zijian and Bai, Qinxun and Blanchet, Jose and Dong, Perry and Xu, Wei and Zhou, Zhengqing and Zhou, Zhengyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13623--13643},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22a/liu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22a.html},\n abstract = \t {Reinforcement learning (RL) has demonstrated remarkable achievements in simulated environments. However, carrying this success to real environments requires the important attribute of robustness, which the existing RL algorithms often lack as they assume that the future deployment environment is the same as the training environment (i.e. simulator) in which the policy is learned. This assumption often does not hold due to the discrepancy between the simulator and the real environment and, as a result, renders the learned policy fragile when deployed. In this paper, we propose a novel distributionally robust $Q$-learning algorithm that learns the best policy in the worst distributional perturbation of the environment. Our algorithm first transforms the infinite-dimensional learning problem (since the environment MDP perturbation lies in an infinite-dimensional space) into a finite-dimensional dual problem and subsequently uses a multi-level Monte-Carlo scheme to approximate the dual value using samples from the simulator. Despite the complexity, we show that the resulting distributionally robust $Q$-learning algorithm asymptotically converges to optimal worst-case policy, thus making it robust to future environment changes. Simulation results further demonstrate its strong empirical robustness.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22a/liu22a.pdf", "supp": "", "pdf_size": 534732, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12028615700632416246&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/liu22a.html" }, { "title": "Distributionally-Aware Kernelized Bandit Problems for Risk Aversion", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17223", "id": "17223", "proceeding": "https://proceedings.mlr.press/v162/takemori22a.html", "poster": "/media/PosterPDFs/ICML%202022/678a1491514b7f1006d605e9161946b1_iJkLGNI.png?t=1658376639.6951663", "slides": "/media/icml-2022/Slides/17223.pdf", "author": "Sho Takemori", "abstract": "The kernelized bandit problem is a theoretically justified framework and has solid applications to various fields. Recently, there is a growing interest in generalizing the problem to the optimization of risk-averse metrics such as Conditional Value-at-Risk (CVaR) or Mean-Variance (MV). However, due to the model assumption, most existing methods need explicit design of environment random variables and can incur large regret because of possible high dimensionality of them. 
To address the issues, in this paper, we model environments using a family of the output distributions (or more precisely, probability kernel) and Kernel Mean Embeddings (KME), and provide novel UCB-type algorithms for CVaR and MV. Moreover, we provide algorithm-independent lower bounds for CVaR in the case of Mat\u00e9rn kernels, and propose a nearly optimal algorithm. Furthermore, we empirically verify our theoretical result in synthetic environments, and demonstrate that our proposed method significantly outperforms a baseline in many cases.", "bibtex": "@InProceedings{pmlr-v162-takemori22a,\n title = \t {Distributionally-Aware Kernelized Bandit Problems for Risk Aversion},\n author = {Takemori, Sho},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20933--20959},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/takemori22a/takemori22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/takemori22a.html},\n abstract = \t {The kernelized bandit problem is a theoretically justified framework and has solid applications to various fields. Recently, there is a growing interest in generalizing the problem to the optimization of risk-averse metrics such as Conditional Value-at-Risk (CVaR) or Mean-Variance (MV). However, due to the model assumption, most existing methods need explicit design of environment random variables and can incur large regret because of possible high dimensionality of them. To address the issues, in this paper, we model environments using a family of the output distributions (or more precisely, probability kernel) and Kernel Mean Embeddings (KME), and provide novel UCB-type algorithms for CVaR and MV. Moreover, we provide algorithm-independent lower bounds for CVaR in the case of Mat\u00e9rn kernels, and propose a nearly optimal algorithm. 
Furthermore, we empirically verify our theoretical result in synthetic environments, and demonstrate that our proposed method significantly outperforms a baseline in many cases.}\n}", "pdf": "https://proceedings.mlr.press/v162/takemori22a/takemori22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/takemori22a-supp.zip", "pdf_size": 524987, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12352746230102073114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Fujitsu Ltd., Kawasaki, Japan", "aff_domain": "fujitsu.com", "email": "fujitsu.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/takemori22a.html", "aff_unique_index": "0", "aff_unique_norm": "Fujitsu Limited", "aff_unique_dep": "", "aff_unique_url": "https://www.fujitsu.com", "aff_unique_abbr": "Fujitsu", "aff_campus_unique_index": "0", "aff_campus_unique": "Kawasaki", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Divergence-Regularized Multi-Agent Actor-Critic", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16021", "id": "16021", "proceeding": "https://proceedings.mlr.press/v162/su22b.html", "poster": "/media/PosterPDFs/ICML%202022/d042be1b4b72c110d21287b3dad13867.png?t=1657532457.0577192", "slides": "", "author_site": "Kefan Su, Zongqing Lu", "author": "Kefan Su; Zongqing Lu", "abstract": "Entropy regularization is a popular method in reinforcement learning (RL). Although it has many advantages, it alters the RL objective and makes the converged policy deviate from the optimal policy of the original Markov Decision Process (MDP). Though divergence regularization has been proposed to settle this problem, it cannot be trivially applied to cooperative multi-agent reinforcement learning (MARL). In this paper, we investigate divergence regularization in cooperative MARL and propose a novel off-policy cooperative MARL framework, divergence-regularized multi-agent actor-critic (DMAC). Theoretically, we derive the update rule of DMAC which is naturally off-policy, guarantees the monotonic policy improvement and convergence in both the original MDP and the divergence-regularized MDP, and is not biased by the regularization. We also give a bound of the discrepancy between the converged policy and the optimal policy in the original MDP. DMAC is a flexible framework and can be combined with many existing MARL algorithms. Empirically, we evaluate DMAC in a didactic stochastic game and StarCraft Multi-Agent Challenge and show that DMAC substantially improves the performance of existing MARL algorithms.", "bibtex": "@InProceedings{pmlr-v162-su22b,\n title = \t {Divergence-Regularized Multi-Agent Actor-Critic},\n author = {Su, Kefan and Lu, Zongqing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20580--20603},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/su22b/su22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/su22b.html},\n abstract = \t {Entropy regularization is a popular method in reinforcement learning (RL). 
Although it has many advantages, it alters the RL objective and makes the converged policy deviate from the optimal policy of the original Markov Decision Process (MDP). Though divergence regularization has been proposed to settle this problem, it cannot be trivially applied to cooperative multi-agent reinforcement learning (MARL). In this paper, we investigate divergence regularization in cooperative MARL and propose a novel off-policy cooperative MARL framework, divergence-regularized multi-agent actor-critic (DMAC). Theoretically, we derive the update rule of DMAC which is naturally off-policy, guarantees the monotonic policy improvement and convergence in both the original MDP and the divergence-regularized MDP, and is not biased by the regularization. We also give a bound of the discrepancy between the converged policy and the optimal policy in the original MDP. DMAC is a flexible framework and can be combined with many existing MARL algorithms. Empirically, we evaluate DMAC in a didactic stochastic game and StarCraft Multi-Agent Challenge and show that DMAC substantially improves the performance of existing MARL algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/su22b/su22b.pdf", "supp": "", "pdf_size": 952714, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2549353071483835929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science, Peking University; School of Computer Science, Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "email": "pku.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/su22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Diversified Adversarial Attacks based on Conjugate Gradient Method", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16955", "id": "16955", "proceeding": "https://proceedings.mlr.press/v162/yamamura22a.html", "poster": "/media/PosterPDFs/ICML%202022/53e3a7161e428b65688f14b84d61c610_mYk646f.png?t=1658237272.5393374", "slides": "/media/icml-2022/Slides/16955.pdf", "author_site": "Keiichiro Yamamura, Haruki Sato, Nariaki Tateiwa, Nozomi Hata, Toru Mitsutake, Issa Oe, Hiroki Ishikura, Katsuki Fujisawa", "author": "Keiichiro Yamamura; Haruki Sato; Nariaki Tateiwa; Nozomi Hata; Toru Mitsutake; Issa Oe; Hiroki Ishikura; Katsuki Fujisawa", "abstract": "Deep learning models are vulnerable to adversarial examples, and adversarial attacks used to generate such examples have attracted considerable research interest. Although existing methods based on the steepest descent have achieved high attack success rates, ill-conditioned problems occasionally reduce their performance. To address this limitation, we utilize the conjugate gradient (CG) method, which is effective for this type of problem, and propose a novel attack algorithm inspired by the CG method, named the Auto Conjugate Gradient (ACG) attack. The results of large-scale evaluation experiments conducted on the latest robust models show that, for most models, ACG was able to find more adversarial examples with fewer iterations than the existing SOTA algorithm Auto-PGD (APGD). 
We investigated the difference in search performance between ACG and APGD in terms of diversification and intensification, and define a measure called Diversity Index (DI) to quantify the degree of diversity. From the analysis of the diversity using this index, we show that the more diverse search of the proposed method remarkably improves its attack success rate.", "bibtex": "@InProceedings{pmlr-v162-yamamura22a,\n title = \t {Diversified Adversarial Attacks based on Conjugate Gradient Method},\n author = {Yamamura, Keiichiro and Sato, Haruki and Tateiwa, Nariaki and Hata, Nozomi and Mitsutake, Toru and Oe, Issa and Ishikura, Hiroki and Fujisawa, Katsuki},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24872--24894},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yamamura22a/yamamura22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yamamura22a.html},\n abstract = \t {Deep learning models are vulnerable to adversarial examples, and adversarial attacks used to generate such examples have attracted considerable research interest. Although existing methods based on the steepest descent have achieved high attack success rates, ill-conditioned problems occasionally reduce their performance. To address this limitation, we utilize the conjugate gradient (CG) method, which is effective for this type of problem, and propose a novel attack algorithm inspired by the CG method, named the Auto Conjugate Gradient (ACG) attack. The results of large-scale evaluation experiments conducted on the latest robust models show that, for most models, ACG was able to find more adversarial examples with fewer iterations than the existing SOTA algorithm Auto-PGD (APGD). We investigated the difference in search performance between ACG and APGD in terms of diversification and intensification, and define a measure called Diversity Index (DI) to quantify the degree of diversity. 
From the analysis of the diversity using this index, we show that the more diverse search of the proposed method remarkably improves its attack success rate.}\n}", "pdf": "https://proceedings.mlr.press/v162/yamamura22a/yamamura22a.pdf", "supp": "", "pdf_size": 4054226, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13855220363786968422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Graduate School of Mathematics, Kyushu University, Fukuoka, Japan; Graduate School of Mathematics, Kyushu University, Fukuoka, Japan; Graduate School of Mathematics, Kyushu University, Fukuoka, Japan + NTT Software Innovation Center, NTT Corporation; Institute of Mathematics for Industry, Kyushu University, Fukuoka, Japan; Graduate School of Mathematics, Kyushu University, Fukuoka, Japan; Graduate School of Mathematics, Kyushu University, Fukuoka, Japan; Graduate School of Mathematics, Kyushu University, Fukuoka, Japan; Institute of Mathematics for Industry, Kyushu University, Fukuoka, Japan", "aff_domain": "kyudai.jp; ; ; ; ; ; ;", "email": "kyudai.jp; ; ; ; ; ; ;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/yamamura22a.html", "aff_unique_index": "0;0;0+1;0;0;0;0;0", "aff_unique_norm": "Kyushu University;NTT Corporation", "aff_unique_dep": "Graduate School of Mathematics;NTT Software Innovation Center", "aff_unique_url": "https://www.kyushu-u.ac.jp;https://www.ntt.co.jp", "aff_unique_abbr": "Kyushu U;NTT", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Fukuoka;", "aff_country_unique_index": "0;0;0+0;0;0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Do Differentiable Simulators Give Better Policy Gradients?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16769", "id": "16769", "proceeding": "https://proceedings.mlr.press/v162/suh22b.html", "poster": "/media/PosterPDFs/ICML%202022/6097d8f3714205740f30debe1166744e.png?t=1658021830.5482602", "slides": "", "author_site": "Hyung Ju Suh, Max Simchowitz, Kaiqing Zhang, Russ Tedrake", "author": "Hyung Ju Suh; Max Simchowitz; Kaiqing Zhang; Russ Tedrake", "abstract": "Differentiable simulators promise faster computation time for reinforcement learning by replacing zeroth-order gradient estimates of a stochastic objective with an estimate based on first-order gradients. However, it is yet unclear what factors decide the performance of the two estimators on complex landscapes that involve long-horizon planning and control on physical systems, despite the crucial relevance of this question for the utility of differentiable simulators. We show that characteristics of certain physical systems, such as stiffness or discontinuities, may compromise the efficacy of the first-order estimator, and analyze this phenomenon through the lens of bias and variance. We additionally propose an $\\alpha$-order gradient estimator, with $\\alpha \\in [0,1]$, which correctly utilizes exact gradients to combine the efficiency of first-order estimates with the robustness of zero-order methods. 
We demonstrate the pitfalls of traditional estimators and the advantages of the $\\alpha$-order estimator on some numerical examples.", "bibtex": "@InProceedings{pmlr-v162-suh22b,\n title = \t {Do Differentiable Simulators Give Better Policy Gradients?},\n author = {Suh, Hyung Ju and Simchowitz, Max and Zhang, Kaiqing and Tedrake, Russ},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20668--20696},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/suh22b/suh22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/suh22b.html},\n abstract = \t {Differentiable simulators promise faster computation time for reinforcement learning by replacing zeroth-order gradient estimates of a stochastic objective with an estimate based on first-order gradients. However, it is yet unclear what factors decide the performance of the two estimators on complex landscapes that involve long-horizon planning and control on physical systems, despite the crucial relevance of this question for the utility of differentiable simulators. We show that characteristics of certain physical systems, such as stiffness or discontinuities, may compromise the efficacy of the first-order estimator, and analyze this phenomenon through the lens of bias and variance. We additionally propose an $\\alpha$-order gradient estimator, with $\\alpha \\in [0,1]$, which correctly utilizes exact gradients to combine the efficiency of first-order estimates with the robustness of zero-order methods. 
We demonstrate the pitfalls of traditional estimators and the advantages of the $\\alpha$-order estimator on some numerical examples.}\n}", "pdf": "https://proceedings.mlr.press/v162/suh22b/suh22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/suh22b-supp.zip", "pdf_size": 3364883, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16263865349473703430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, USA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/suh22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Do More Negative Samples Necessarily Hurt In Contrastive Learning?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18255", "id": "18255", "proceeding": "https://proceedings.mlr.press/v162/awasthi22b.html", "poster": "/media/PosterPDFs/ICML%202022/405075699f065e43581f27d67bb68478.png?t=1658188516.2928522", "slides": "", "author_site": "Pranjal Awasthi, Nishanth Dikkala, Pritish Kamath", "author": "Pranjal Awasthi; Nishanth Dikkala; Pritish Kamath", "abstract": "Recent investigations in noise contrastive estimation suggest, both empirically as well as theoretically, that while having more \u201cnegative samples\u201d in the contrastive loss improves downstream classification performance initially, beyond a threshold, it hurts downstream performance due to a \u201ccollision-coverage\u201d trade-off. But is such a phenomenon inherent in contrastive learning? We show in a simple theoretical setting, where positive pairs are generated by sampling from the underlying latent class (introduced by Saunshi et al. (ICML 2019)), that the downstream performance of the representation optimizing the (population) contrastive loss in fact does not degrade with the number of negative samples. Along the way, we give a structural characterization of the optimal representation in our framework, for noise contrastive estimation. 
We also provide empirical support for our theoretical results on CIFAR-10 and CIFAR-100 datasets.", "bibtex": "@InProceedings{pmlr-v162-awasthi22b,\n title = \t {Do More Negative Samples Necessarily Hurt In Contrastive Learning?},\n author = {Awasthi, Pranjal and Dikkala, Nishanth and Kamath, Pritish},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1101--1116},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/awasthi22b/awasthi22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/awasthi22b.html},\n abstract = \t {Recent investigations in noise contrastive estimation suggest, both empirically as well as theoretically, that while having more \u201cnegative samples\u201d in the contrastive loss improves downstream classification performance initially, beyond a threshold, it hurts downstream performance due to a \u201ccollision-coverage\u201d trade-off. But is such a phenomenon inherent in contrastive learning? We show in a simple theoretical setting, where positive pairs are generated by sampling from the underlying latent class (introduced by Saunshi et al. (ICML 2019)), that the downstream performance of the representation optimizing the (population) contrastive loss in fact does not degrade with the number of negative samples. Along the way, we give a structural characterization of the optimal representation in our framework, for noise contrastive estimation. We also provide empirical support for our theoretical results on CIFAR-10 and CIFAR-100 datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/awasthi22b/awasthi22b.pdf", "supp": "", "pdf_size": 618536, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15098943763361391941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google Research, USA; Google Research, USA; Google Research, USA", "aff_domain": "google.com;alum.mit.edu; ", "email": "google.com;alum.mit.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/awasthi22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Does the Data Induce Capacity Control in Deep Learning?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16831", "id": "16831", "proceeding": "https://proceedings.mlr.press/v162/yang22k.html", "poster": "/media/PosterPDFs/ICML%202022/2d405b367158e3f12d7c1e31a96b3af3.png?t=1657684004.3593733", "slides": "", "author_site": "Rubing Yang, Jialin Mao, Pratik Chaudhari", "author": "Rubing Yang; Jialin Mao; Pratik Chaudhari", "abstract": "We show that the input correlation matrix of typical classification datasets has an eigenspectrum where, after a sharp initial drop, a large number of small eigenvalues are distributed uniformly over an exponentially large range. 
This structure is mirrored in a network trained on this data: we show that the Hessian and the Fisher Information Matrix (FIM) have eigenvalues that are spread uniformly over exponentially large ranges. We call such eigenspectra \u201csloppy\u201d because sets of weights corresponding to small eigenvalues can be changed by large magnitudes without affecting the loss. Networks trained on atypical datasets with non-sloppy inputs do not share these traits and deep networks trained on such datasets generalize poorly. Inspired by this, we study the hypothesis that sloppiness of inputs aids generalization in deep networks. We show that if the Hessian is sloppy, we can compute non-vacuous PAC-Bayes generalization bounds analytically. By exploiting our empirical observation that training predominantly takes place in the non-sloppy subspace of the FIM, we develop data-distribution dependent PAC-Bayes priors that lead to accurate generalization bounds using numerical optimization.", "bibtex": "@InProceedings{pmlr-v162-yang22k,\n title = \t {Does the Data Induce Capacity Control in Deep Learning?},\n author = {Yang, Rubing and Mao, Jialin and Chaudhari, Pratik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25166--25197},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22k/yang22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22k.html},\n abstract = \t {We show that the input correlation matrix of typical classification datasets has an eigenspectrum where, after a sharp initial drop, a large number of small eigenvalues are distributed uniformly over an exponentially large range. This structure is mirrored in a network trained on this data: we show that the Hessian and the Fisher Information Matrix (FIM) have eigenvalues that are spread uniformly over exponentially large ranges. We call such eigenspectra \u201csloppy\u201d because sets of weights corresponding to small eigenvalues can be changed by large magnitudes without affecting the loss. Networks trained on atypical datasets with non-sloppy inputs do not share these traits and deep networks trained on such datasets generalize poorly. Inspired by this, we study the hypothesis that sloppiness of inputs aids generalization in deep networks. We show that if the Hessian is sloppy, we can compute non-vacuous PAC-Bayes generalization bounds analytically. 
By exploiting our empirical observation that training predominantly takes place in the non-sloppy subspace of the FIM, we develop data-distribution dependent PAC-Bayes priors that lead to accurate generalization bounds using numerical optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22k/yang22k.pdf", "supp": "", "pdf_size": 1495664, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=884919534291840762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Applied Mathematics and Computational Science, University of Pennsylvania; Applied Mathematics and Computational Science, University of Pennsylvania; Electrical and Systems Engineering, University of Pennsylvania", "aff_domain": "sas.upenn.edu; ; ", "email": "sas.upenn.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yang22k.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "Department of Applied Mathematics and Computational Science", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Domain Adaptation for Time Series Forecasting via Attention Sharing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16937", "id": "16937", "proceeding": "https://proceedings.mlr.press/v162/jin22d.html", "poster": "/media/PosterPDFs/ICML%202022/334467d41d5cf21e234465a1530ba647.png?t=1658363210.2527797", "slides": "", "author_site": "Xiaoyong Jin, Youngsuk Park, Danielle Robinson, Hao Wang, Yuyang Wang", "author": "Xiaoyong Jin; Youngsuk Park; Danielle Maddix; Hao Wang; Yuyang Wang", "abstract": "Recently, deep neural networks have gained increasing popularity in the field of time series forecasting. A primary reason for their success is their ability to effectively capture complex temporal dynamics across multiple related time series. The advantages of these deep forecasters only start to emerge in the presence of a sufficient amount of data. This poses a challenge for typical forecasting problems in practice, where there is a limited number of time series or observations per time series, or both. To cope with this data scarcity issue, we propose a novel domain adaptation framework, Domain Adaptation Forecaster (DAF). DAF leverages statistical strengths from a relevant domain with abundant data samples (source) to improve the performance on the domain of interest with limited data (target). In particular, we use an attention-based shared module with a domain discriminator across domains and private modules for individual domains. We induce domain-invariant latent features (queries and keys) and retrain domain-specific features (values) simultaneously to enable joint training of forecasters on source and target domains. A main insight is that our design of aligning keys allows the target domain to leverage source time series even with different characteristics. 
Extensive experiments on various domains demonstrate that our proposed method outperforms state-of-the-art baselines on synthetic and real-world datasets, and ablation studies verify the effectiveness of our design choices.", "bibtex": "@InProceedings{pmlr-v162-jin22d,\n title = \t {Domain Adaptation for Time Series Forecasting via Attention Sharing},\n author = {Jin, Xiaoyong and Park, Youngsuk and Maddix, Danielle and Wang, Hao and Wang, Yuyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10280--10297},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22d/jin22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22d.html},\n abstract = \t {Recently, deep neural networks have gained increasing popularity in the field of time series forecasting. A primary reason for their success is their ability to effectively capture complex temporal dynamics across multiple related time series. The advantages of these deep forecasters only start to emerge in the presence of a sufficient amount of data. This poses a challenge for typical forecasting problems in practice, where there is a limited number of time series or observations per time series, or both. To cope with this data scarcity issue, we propose a novel domain adaptation framework, Domain Adaptation Forecaster (DAF). DAF leverages statistical strengths from a relevant domain with abundant data samples (source) to improve the performance on the domain of interest with limited data (target). In particular, we use an attention-based shared module with a domain discriminator across domains and private modules for individual domains. We induce domain-invariant latent features (queries and keys) and retrain domain-specific features (values) simultaneously to enable joint training of forecasters on source and target domains. A main insight is that our design of aligning keys allows the target domain to leverage source time series even with different characteristics. 
Extensive experiments on various domains demonstrate that our proposed method outperforms state-of-the-art baselines on synthetic and real-world datasets, and ablation studies verify the effectiveness of our design choices.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22d/jin22d.pdf", "supp": "", "pdf_size": 1194313, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18199756569119739890&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of California Santa Barbara, California, USA (work done during internship at Amazon AWS AI); Amazon AWS AI; Amazon AWS AI; Rutgers University; Amazon AWS AI", "aff_domain": "cs.ucsb.edu; ; ; ; ", "email": "cs.ucsb.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/jin22d.html", "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "University of California Santa Barbara;Amazon;Rutgers University", "aff_unique_dep": "Department of Computer Science;Amazon Web Services AI;", "aff_unique_url": "https://www.ucsb.edu;https://aws.amazon.com;https://www.rutgers.edu", "aff_unique_abbr": "UCSB;AWS;Rutgers", "aff_campus_unique_index": "0", "aff_campus_unique": "Santa Barbara;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Double Sampling Randomized Smoothing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18329", "id": "18329", "proceeding": "https://proceedings.mlr.press/v162/li22aa.html", "poster": "/media/PosterPDFs/ICML%202022/cb8a08a240f3ea7c99b220d24f54f477.png?t=1657870402.5050657", "slides": "", "author_site": "Linyi Li, Jiawei Zhang, Tao Xie, Bo Li", "author": "Linyi Li; Jiawei Zhang; Tao Xie; Bo Li", "abstract": "Neural networks (NNs) are known to be vulnerable against adversarial perturbations, and thus there is a line of work aiming to provide robustness certification for NNs, such as randomized smoothing, which samples smoothing noises from a certain distribution to certify the robustness for a smoothed classifier. However, as previous work shows, the certified robust radius in randomized smoothing suffers from scaling to large datasets (\"curse of dimensionality\"). To overcome this hurdle, we propose a Double Sampling Randomized Smoothing (DSRS) framework, which exploits the sampled probability from an additional smoothing distribution to tighten the robustness certification of the previous smoothed classifier. Theoretically, under mild assumptions, we prove that DSRS can certify $\\Theta(\\sqrt d)$ robust radius under $\\ell_2$ norm where $d$ is the input dimension, which implies that DSRS may be able to break the curse of dimensionality of randomized smoothing. We instantiate DSRS for a generalized family of Gaussian smoothing and propose an efficient and sound computing method based on customized dual optimization considering sampling error. Extensive experiments on MNIST, CIFAR-10, and ImageNet verify our theory and show that DSRS certifies larger robust radii than existing baselines consistently under different settings. 
Code is available at https://github.com/llylly/DSRS.", "bibtex": "@InProceedings{pmlr-v162-li22aa,\n title = \t {Double Sampling Randomized Smoothing},\n author = {Li, Linyi and Zhang, Jiawei and Xie, Tao and Li, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13163--13208},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22aa/li22aa.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22aa.html},\n abstract = \t {Neural networks (NNs) are known to be vulnerable against adversarial perturbations, and thus there is a line of work aiming to provide robustness certification for NNs, such as randomized smoothing, which samples smoothing noises from a certain distribution to certify the robustness for a smoothed classifier. However, as previous work shows, the certified robust radius in randomized smoothing suffers from scaling to large datasets (\"curse of dimensionality\"). To overcome this hurdle, we propose a Double Sampling Randomized Smoothing (DSRS) framework, which exploits the sampled probability from an additional smoothing distribution to tighten the robustness certification of the previous smoothed classifier. Theoretically, under mild assumptions, we prove that DSRS can certify $\\Theta(\\sqrt d)$ robust radius under $\\ell_2$ norm where $d$ is the input dimension, which implies that DSRS may be able to break the curse of dimensionality of randomized smoothing. We instantiate DSRS for a generalized family of Gaussian smoothing and propose an efficient and sound computing method based on customized dual optimization considering sampling error. Extensive experiments on MNIST, CIFAR-10, and ImageNet verify our theory and show that DSRS certifies larger robust radii than existing baselines consistently under different settings. 
Code is available at https://github.com/llylly/DSRS.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22aa/li22aa.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/li22aa-supp.zip", "pdf_size": 16513171, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13905428147766407509&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Illinois Urbana-Champaign; University of Illinois Urbana-Champaign; Peking University; University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu; ; ; ", "email": "illinois.edu; ; ; ", "github": "https://github.com/llylly/DSRS", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/li22aa.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Peking University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;http://www.pku.edu.cn", "aff_unique_abbr": "UIUC;Peking U", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Doubly Robust Distributionally Robust Off-Policy Evaluation and Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16749", "id": "16749", "proceeding": "https://proceedings.mlr.press/v162/kallus22a.html", "poster": "/media/PosterPDFs/ICML%202022/f48c04ffab49ff0e5d1176244fdfb65c.png?t=1657848057.1112387", "slides": "", "author_site": "Nathan Kallus, Xiaojie Mao, Kaiwen Wang, Zhengyuan Zhou", "author": "Nathan Kallus; Xiaojie Mao; Kaiwen Wang; Zhengyuan Zhou", "abstract": "Off-policy evaluation and learning (OPE/L) use offline observational data to make better decisions, which is crucial in applications where online experimentation is limited. However, depending entirely on logged data, OPE/L is sensitive to environment distribution shifts \u2014 discrepancies between the data-generating environment and that where policies are deployed. Si et al., (2020) proposed distributionally robust OPE/L (DROPE/L) to address this, but the proposal relies on inverse-propensity weighting, whose estimation error and regret will deteriorate if propensities are nonparametrically estimated and whose variance is suboptimal even if not. For standard, non-robust, OPE/L, this is solved by doubly robust (DR) methods, but they do not naturally extend to the more complex DROPE/L, which involves a worst-case expectation. In this paper, we propose the first DR algorithms for DROPE/L with KL-divergence uncertainty sets. For evaluation, we propose Localized Doubly Robust DROPE (LDR$^2$OPE) and show that it achieves semiparametric efficiency under weak product rates conditions. Thanks to a localization technique, LDR$^2$OPE only requires fitting a small number of regressions, just like DR methods for standard OPE. For learning, we propose Continuum Doubly Robust DROPL (CDR$^2$OPL) and show that, under a product rate condition involving a continuum of regressions, it enjoys a fast regret rate of $O(N^{-1/2})$ even when unknown propensities are nonparametrically estimated. 
We empirically validate our algorithms in simulations and further extend our results to general $f$-divergence uncertainty sets.", "bibtex": "@InProceedings{pmlr-v162-kallus22a,\n title = \t {Doubly Robust Distributionally Robust Off-Policy Evaluation and Learning},\n author = {Kallus, Nathan and Mao, Xiaojie and Wang, Kaiwen and Zhou, Zhengyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10598--10632},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kallus22a/kallus22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kallus22a.html},\n abstract = \t {Off-policy evaluation and learning (OPE/L) use offline observational data to make better decisions, which is crucial in applications where online experimentation is limited. However, depending entirely on logged data, OPE/L is sensitive to environment distribution shifts \u2014 discrepancies between the data-generating environment and that where policies are deployed. Si et al., (2020) proposed distributionally robust OPE/L (DROPE/L) to address this, but the proposal relies on inverse-propensity weighting, whose estimation error and regret will deteriorate if propensities are nonparametrically estimated and whose variance is suboptimal even if not. For standard, non-robust, OPE/L, this is solved by doubly robust (DR) methods, but they do not naturally extend to the more complex DROPE/L, which involves a worst-case expectation. In this paper, we propose the first DR algorithms for DROPE/L with KL-divergence uncertainty sets. For evaluation, we propose Localized Doubly Robust DROPE (LDR$^2$OPE) and show that it achieves semiparametric efficiency under weak product rates conditions. Thanks to a localization technique, LDR$^2$OPE only requires fitting a small number of regressions, just like DR methods for standard OPE. For learning, we propose Continuum Doubly Robust DROPL (CDR$^2$OPL) and show that, under a product rate condition involving a continuum of regressions, it enjoys a fast regret rate of $O(N^{-1/2})$ even when unknown propensities are nonparametrically estimated. 
We empirically validate our algorithms in simulations and further extend our results to general $f$-divergence uncertainty sets.}\n}", "pdf": "https://proceedings.mlr.press/v162/kallus22a/kallus22a.pdf", "supp": "", "pdf_size": 826605, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3538177620069646339&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Cornell University and Cornell Tech; Tsinghua University; Cornell University and Cornell Tech; Arena Technologies and New York University", "aff_domain": "cornell.edu; ; ; ", "email": "cornell.edu; ; ; ", "github": "https://kaiwenw.github.io", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kallus22a.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Cornell University;Tsinghua University;New York University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cornell.edu;https://www.tsinghua.edu.cn;https://www.nyu.edu", "aff_unique_abbr": "Cornell;THU;NYU", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Ithaca;;New York", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "DreamerPro: Reconstruction-Free Model-Based Reinforcement Learning with Prototypical Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17995", "id": "17995", "proceeding": "https://proceedings.mlr.press/v162/deng22a.html", "poster": "/media/PosterPDFs/ICML%202022/1de7d2b90d554be9f0db1c338e80197d.png?t=1656731767.9536588", "slides": "/media/icml-2022/Slides/17995.pdf", "author_site": "Fei Deng, Ingook Jang, Sungjin Ahn", "author": "Fei Deng; Ingook Jang; Sungjin Ahn", "abstract": "Reconstruction-based Model-Based Reinforcement Learning (MBRL) agents, such as Dreamer, often fail to discard task-irrelevant visual distractions that are prevalent in natural scenes. In this paper, we propose a reconstruction-free MBRL agent, called DreamerPro, that can enhance robustness to distractions. Motivated by the recent success of prototypical representations, a non-contrastive self-supervised learning approach in computer vision, DreamerPro combines Dreamer with prototypes. In order for the prototypes to benefit temporal dynamics learning in MBRL, we propose to additionally learn the prototypes from the recurrent states of the world model, thereby distilling temporal structures from past observations and actions into the prototypes. 
Experiments on the DeepMind Control suite show that DreamerPro achieves better overall performance than state-of-the-art contrastive MBRL agents when there are complex background distractions, and maintains similar performance as Dreamer in standard tasks where contrastive MBRL agents can perform much worse.", "bibtex": "@InProceedings{pmlr-v162-deng22a,\n title = \t {{D}reamer{P}ro: Reconstruction-Free Model-Based Reinforcement Learning with Prototypical Representations},\n author = {Deng, Fei and Jang, Ingook and Ahn, Sungjin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4956--4975},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/deng22a/deng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/deng22a.html},\n abstract = \t {Reconstruction-based Model-Based Reinforcement Learning (MBRL) agents, such as Dreamer, often fail to discard task-irrelevant visual distractions that are prevalent in natural scenes. In this paper, we propose a reconstruction-free MBRL agent, called DreamerPro, that can enhance robustness to distractions. Motivated by the recent success of prototypical representations, a non-contrastive self-supervised learning approach in computer vision, DreamerPro combines Dreamer with prototypes. In order for the prototypes to benefit temporal dynamics learning in MBRL, we propose to additionally learn the prototypes from the recurrent states of the world model, thereby distilling temporal structures from past observations and actions into the prototypes. 
Experiments on the DeepMind Control suite show that DreamerPro achieves better overall performance than state-of-the-art contrastive MBRL agents when there are complex background distractions, and maintains similar performance as Dreamer in standard tasks where contrastive MBRL agents can perform much worse.}\n}", "pdf": "https://proceedings.mlr.press/v162/deng22a/deng22a.pdf", "supp": "", "pdf_size": 3140802, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11064573461444670693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Rutgers University; ETRI; School of Computing, KAIST", "aff_domain": "rutgers.edu; ;kaist.ac.kr", "email": "rutgers.edu; ;kaist.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/deng22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Rutgers University;Electronics and Telecommunications Research Institute;KAIST", "aff_unique_dep": "Department of Computer Science;;School of Computing", "aff_unique_url": "https://www.rutgers.edu;https://www.etri.re.kr;https://www.kaist.ac.kr", "aff_unique_abbr": "Rutgers;ETRI;KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;South Korea" }, { "title": "Dual Decomposition of Convex Optimization Layers for Consistent Attention in Medical Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17537", "id": "17537", "proceeding": "https://proceedings.mlr.press/v162/ron22a.html", "poster": "", "slides": "", "author_site": "Tom Ron, Tamir Hazan", "author": "Tom Ron; Tamir Hazan", "abstract": "A key concern in integrating machine learning models in medicine is the ability to interpret their reasoning. Popular explainability methods have demonstrated satisfactory results in natural image recognition, yet in medical image analysis, many of these approaches provide partial and noisy explanations. Recently, attention mechanisms have shown compelling results both in their predictive performance and in their interpretable qualities. A fundamental trait of attention is that it leverages salient parts of the input which contribute to the model\u2019s prediction. To this end, our work focuses on the explanatory value of attention weight distributions. We propose a multi-layer attention mechanism that enforces consistent interpretations between attended convolutional layers using convex optimization. We apply duality to decompose the consistency constraints between the layers by reparameterizing their attention probability distributions. We further suggest learning the dual witness by optimizing with respect to our objective; thus, our implementation uses standard back-propagation, hence it is highly efficient. 
While preserving predictive performance, our proposed method leverages weakly annotated medical imaging data and provides complete and faithful explanations to the model\u2019s prediction.", "bibtex": "@InProceedings{pmlr-v162-ron22a,\n title = \t {Dual Decomposition of Convex Optimization Layers for Consistent Attention in Medical Images},\n author = {Ron, Tom and Hazan, Tamir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18754--18769},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ron22a/ron22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ron22a.html},\n abstract = \t {A key concern in integrating machine learning models in medicine is the ability to interpret their reasoning. Popular explainability methods have demonstrated satisfactory results in natural image recognition, yet in medical image analysis, many of these approaches provide partial and noisy explanations. Recently, attention mechanisms have shown compelling results both in their predictive performance and in their interpretable qualities. A fundamental trait of attention is that it leverages salient parts of the input which contribute to the model\u2019s prediction. To this end, our work focuses on the explanatory value of attention weight distributions. We propose a multi-layer attention mechanism that enforces consistent interpretations between attended convolutional layers using convex optimization. We apply duality to decompose the consistency constraints between the layers by reparameterizing their attention probability distributions. We further suggest learning the dual witness by optimizing with respect to our objective; thus, our implementation uses standard back-propagation, hence it is highly efficient. While preserving predictive performance, our proposed method leverages weakly annotated medical imaging data and provides complete and faithful explanations to the model\u2019s prediction.}\n}", "pdf": "https://proceedings.mlr.press/v162/ron22a/ron22a.pdf", "supp": "", "pdf_size": 2454927, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10465544337215443782&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ron22a.html" }, { "title": "Dual Perspective of Label-Specific Feature Learning for Multi-Label Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16631", "id": "16631", "proceeding": "https://proceedings.mlr.press/v162/hang22a.html", "poster": "/media/PosterPDFs/ICML%202022/9fd98f856d3ca2086168f264a117ed7c.png?t=1657359350.78021", "slides": "", "author_site": "Jun-Yi Hang, Min-Ling Zhang", "author": "Jun-Yi Hang; Min-Ling Zhang", "abstract": "Label-specific features serve as an effective strategy to facilitate multi-label classification, which account for the distinct discriminative properties of each class label via tailoring its own features. Existing approaches implement this strategy in a quite straightforward way, i.e. 
finding the most pertinent and discriminative features for each class label and directly inducing classifiers on constructed label-specific features. In this paper, we propose a dual perspective for label-specific feature learning, where label-specific discriminative properties are considered by identifying each label\u2019s own non-informative features and making the discrimination process immutable to variations of these features. To instantiate it, we present a perturbation-based approach DELA to provide classifiers with label-specific immutability on simultaneously identified non-informative features, which is optimized towards a probabilistically-relaxed expected risk minimization problem. Comprehensive experiments on 10 benchmark data sets show that our approach outperforms the state-of-the-art counterparts.", "bibtex": "@InProceedings{pmlr-v162-hang22a,\n title = \t {Dual Perspective of Label-Specific Feature Learning for Multi-Label Classification},\n author = {Hang, Jun-Yi and Zhang, Min-Ling},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8375--8386},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hang22a/hang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hang22a.html},\n abstract = \t {Label-specific features serve as an effective strategy to facilitate multi-label classification, which account for the distinct discriminative properties of each class label via tailoring its own features. Existing approaches implement this strategy in a quite straightforward way, i.e. finding the most pertinent and discriminative features for each class label and directly inducing classifiers on constructed label-specific features. In this paper, we propose a dual perspective for label-specific feature learning, where label-specific discriminative properties are considered by identifying each label\u2019s own non-informative features and making the discrimination process immutable to variations of these features. To instantiate it, we present a perturbation-based approach DELA to provide classifiers with label-specific immutability on simultaneously identified non-informative features, which is optimized towards a probabilistically-relaxed expected risk minimization problem. 
Comprehensive experiments on 10 benchmark data sets show that our approach outperforms the state-of-the-art counterparts.}\n}", "pdf": "https://proceedings.mlr.press/v162/hang22a/hang22a.pdf", "supp": "", "pdf_size": 750845, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5804405397707598095&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Computer Science and Engineering, Southeast University, Nanjing 210096, China+Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China; School of Computer Science and Engineering, Southeast University, Nanjing 210096, China+Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China", "aff_domain": "seu.edu.cn;seu.edu.cn", "email": "seu.edu.cn;seu.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/hang22a.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "DynaMixer: A Vision MLP Architecture with Dynamic Mixing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17199", "id": "17199", "proceeding": "https://proceedings.mlr.press/v162/wang22i.html", "poster": "/media/PosterPDFs/ICML%202022/7cbbc409ec990f19c78c75bd1e06f215.png?t=1656223541.675687", "slides": "/media/icml-2022/Slides/17199.pdf", "author_site": "Ziyu Wang, Wenhao Jiang, Yiming Zhu, Li Yuan, Yibing Song, Wei Liu", "author": "Ziyu Wang; Wenhao Jiang; Yiming M Zhu; Li Yuan; Yibing Song; Wei Liu", "abstract": "Recently, MLP-like vision models have achieved promising performances on mainstream visual recognition tasks. In contrast with vision transformers and CNNs, the success of MLP-like models shows that simple information fusion operations among tokens and channels can yield a good representation power for deep recognition models. However, existing MLP-like models fuse tokens through static fusion operations, lacking adaptability to the contents of the tokens to be mixed. Thus, customary information fusion procedures are not effective enough. To this end, this paper presents an efficient MLP-like network architecture, dubbed DynaMixer, resorting to dynamic information fusion. Critically, we propose a procedure, on which the DynaMixer model relies, to dynamically generate mixing matrices by leveraging the contents of all the tokens to be mixed. To reduce the time complexity and improve the robustness, a dimensionality reduction technique and a multi-segment fusion mechanism are adopted. Our proposed DynaMixer model (97M parameters) achieves 84.3% top-1 accuracy on the ImageNet-1K dataset without extra training data, performing favorably against the state-of-the-art vision MLP models. When the number of parameters is reduced to 26M, it still achieves 82.7% top-1 accuracy, surpassing the existing MLP-like models with a similar capacity. 
The code is available at \\url{https://github.com/ziyuwwang/DynaMixer}.", "bibtex": "@InProceedings{pmlr-v162-wang22i,\n title = \t {{D}yna{M}ixer: A Vision {MLP} Architecture with Dynamic Mixing},\n author = {Wang, Ziyu and Jiang, Wenhao and Zhu, Yiming M and Yuan, Li and Song, Yibing and Liu, Wei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22691--22701},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22i/wang22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22i.html},\n abstract = \t {Recently, MLP-like vision models have achieved promising performances on mainstream visual recognition tasks. In contrast with vision transformers and CNNs, the success of MLP-like models shows that simple information fusion operations among tokens and channels can yield a good representation power for deep recognition models. However, existing MLP-like models fuse tokens through static fusion operations, lacking adaptability to the contents of the tokens to be mixed. Thus, customary information fusion procedures are not effective enough. To this end, this paper presents an efficient MLP-like network architecture, dubbed DynaMixer, resorting to dynamic information fusion. Critically, we propose a procedure, on which the DynaMixer model relies, to dynamically generate mixing matrices by leveraging the contents of all the tokens to be mixed. To reduce the time complexity and improve the robustness, a dimensionality reduction technique and a multi-segment fusion mechanism are adopted. Our proposed DynaMixer model (97M parameters) achieves 84.3% top-1 accuracy on the ImageNet-1K dataset without extra training data, performing favorably against the state-of-the-art vision MLP models. When the number of parameters is reduced to 26M, it still achieves 82.7% top-1 accuracy, surpassing the existing MLP-like models with a similar capacity. 
The code is available at \\url{https://github.com/ziyuwwang/DynaMixer}.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22i/wang22i.pdf", "supp": "", "pdf_size": 493820, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9756910838903336255&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Data Platform, Tencent; Data Platform, Tencent; Graduate school at ShenZhen, Tsinghua university; School of Electrical and Computer Engineering, Peking University; Tencent AI Lab; Data Platform, Tencent", "aff_domain": "tencent.com;gmail.com;tsinghua.edu.cn;pku.edu.cn;tencent.com;columbia.edu", "email": "tencent.com;gmail.com;tsinghua.edu.cn;pku.edu.cn;tencent.com;columbia.edu", "github": "https://github.com/ziyuwwang/DynaMixer", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22i.html", "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Tencent;Tsinghua University;Peking University", "aff_unique_dep": "Data Platform;Graduate School;School of Electrical and Computer Engineering", "aff_unique_url": "https://www.tencent.com;http://www.tsinghua.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "Tencent;THU;PKU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Shenzhen;Beijing", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Dynamic Regret of Online Markov Decision Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17645", "id": "17645", "proceeding": "https://proceedings.mlr.press/v162/zhao22c.html", "poster": "/media/PosterPDFs/ICML%202022/cdaeb1282d614772beb1e74c192bebda.png?t=1657353930.6262703", "slides": "", "author_site": "Peng Zhao, Long-Fei Li, Zhi-Hua Zhou", "author": "Peng Zhao; Long-Fei Li; Zhi-Hua Zhou", "abstract": "We investigate online Markov Decision Processes\u00a0(MDPs) with adversarially changing loss functions and known transitions. We choose", "bibtex": "@InProceedings{pmlr-v162-zhao22c,\n title = \t {Dynamic Regret of Online {M}arkov Decision Processes},\n author = {Zhao, Peng and Li, Long-Fei and Zhou, Zhi-Hua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26865--26894},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22c/zhao22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22c.html},\n abstract = \t {We investigate online Markov Decision Processes\u00a0(MDPs) with adversarially changing loss functions and known transitions. 
We choose", "pdf": "https://proceedings.mlr.press/v162/zhao22c/zhao22c.pdf", "supp": "", "pdf_size": 550103, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2916684397539911889&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University; National Key Laboratory for Novel Software Technology, Nanjing University; National Key Laboratory for Novel Software Technology, Nanjing University", "aff_domain": "lamda.nju.edu.cn;lamda.nju.edu.cn;lamda.nju.edu.cn", "email": "lamda.nju.edu.cn;lamda.nju.edu.cn;lamda.nju.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "National Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.nju.edu.cn", "aff_unique_abbr": "Nanjing University", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Dynamic Topic Models for Temporal Document Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16123", "id": "16123", "proceeding": "https://proceedings.mlr.press/v162/zhang22n.html", "poster": "/media/PosterPDFs/ICML%202022/a14ac55a4f27472c5d894ec1c3c743d2.png?t=1657866414.9598155", "slides": "", "author_site": "Delvin Zhang, Hady Lauw", "author": "Delvin Ce Zhang; Hady Lauw", "abstract": "Dynamic topic models explore the time evolution of topics in temporally accumulative corpora. While existing topic models focus on the dynamics of individual documents, we propose two neural topic models aimed at learning unified topic distributions that incorporate both document dynamics and network structure. For the first model, by adding a time dimension, we propose Time-Aware Optimal Transport, which measures the probability of a link between two differently timestamped documents using their semantic distance. Since the gradually evolving topological structure of network may also influence the establishment of a new link, for the second model, we further design a Temporal Point Process to capture the impact of historical neighbors on the current link formation at the network level. Experiments on four dynamic document networks demonstrate the advantage of our models in jointly modeling document dynamics and network adjacency.", "bibtex": "@InProceedings{pmlr-v162-zhang22n,\n title = \t {Dynamic Topic Models for Temporal Document Networks},\n author = {Zhang, Delvin Ce and Lauw, Hady},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26281--26292},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22n/zhang22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22n.html},\n abstract = \t {Dynamic topic models explore the time evolution of topics in temporally accumulative corpora. While existing topic models focus on the dynamics of individual documents, we propose two neural topic models aimed at learning unified topic distributions that incorporate both document dynamics and network structure. 
For the first model, by adding a time dimension, we propose Time-Aware Optimal Transport, which measures the probability of a link between two differently timestamped documents using their semantic distance. Since the gradually evolving topological structure of network may also influence the establishment of a new link, for the second model, we further design a Temporal Point Process to capture the impact of historical neighbors on the current link formation at the network level. Experiments on four dynamic document networks demonstrate the advantage of our models in jointly modeling document dynamics and network adjacency.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22n/zhang22n.pdf", "supp": "", "pdf_size": 947195, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3028409348758783164&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computing and Information Systems, Singapore Management University; School of Computing and Information Systems, Singapore Management University", "aff_domain": "smu.edu.sg; ", "email": "smu.edu.sg; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhang22n.html", "aff_unique_index": "0;0", "aff_unique_norm": "Singapore Management University", "aff_unique_dep": "School of Computing and Information Systems", "aff_unique_url": "https://www.smu.edu.sg", "aff_unique_abbr": "SMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Singapore" }, { "title": "EAT-C: Environment-Adversarial sub-Task Curriculum for Efficient Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16147", "id": "16147", "proceeding": "https://proceedings.mlr.press/v162/ao22a.html", "poster": "/media/PosterPDFs/ICML%202022/c7e1249ffc03eb9ded908c236bd1996d_yZWeHqU.png?t=1657322431.6423042", "slides": "", "author_site": "Shuang Ao, Tianyi Zhou, Jing Jiang, Guodong Long, Xuan Song, Chengqi Zhang", "author": "Shuang Ao; Tianyi Zhou; Jing Jiang; Guodong Long; Xuan Song; Chengqi Zhang", "abstract": "Reinforcement learning (RL) is inefficient on long-horizon tasks due to sparse rewards and its policy can be fragile to slightly perturbed environments. We address these challenges via a curriculum of tasks with coupled environments, generated by two policies trained jointly with RL: (1) a co-operative planning policy recursively decomposing a hard task into a coarse-to-fine sub-task tree; and (2) an adversarial policy modifying the environment in each sub-task. They are complementary to acquire more informative feedback for RL: (1) provides dense reward of easier sub-tasks while (2) modifies sub-tasks\u2019 environments to be more challenging and diverse. Conversely, they are trained by RL\u2019s dense feedback on sub-tasks so their generated curriculum keeps adaptive to RL\u2019s progress. The sub-task tree enables an easy-to-hard curriculum for every policy: its top-down construction gradually increases sub-tasks the planner needs to generate, while the adversarial training between the environment and RL follows a bottom-up traversal that starts from a dense sequence of easier sub-tasks allowing more frequent environment changes. We compare EAT-C with RL/planning targeting similar problems and methods with environment generators or adversarial agents. 
Extensive experiments on diverse tasks demonstrate the advantages of our method on improving RL\u2019s efficiency and generalization.", "bibtex": "@InProceedings{pmlr-v162-ao22a,\n title = \t {{EAT}-C: Environment-Adversarial sub-Task Curriculum for Efficient Reinforcement Learning},\n author = {Ao, Shuang and Zhou, Tianyi and Jiang, Jing and Long, Guodong and Song, Xuan and Zhang, Chengqi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {822--843},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ao22a/ao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ao22a.html},\n abstract = \t {Reinforcement learning (RL) is inefficient on long-horizon tasks due to sparse rewards and its policy can be fragile to slightly perturbed environments. We address these challenges via a curriculum of tasks with coupled environments, generated by two policies trained jointly with RL: (1) a co-operative planning policy recursively decomposing a hard task into a coarse-to-fine sub-task tree; and (2) an adversarial policy modifying the environment in each sub-task. They are complementary to acquire more informative feedback for RL: (1) provides dense reward of easier sub-tasks while (2) modifies sub-tasks\u2019 environments to be more challenging and diverse. Conversely, they are trained by RL\u2019s dense feedback on sub-tasks so their generated curriculum keeps adaptive to RL\u2019s progress. The sub-task tree enables an easy-to-hard curriculum for every policy: its top-down construction gradually increases sub-tasks the planner needs to generate, while the adversarial training between the environment and RL follows a bottom-up traversal that starts from a dense sequence of easier sub-tasks allowing more frequent environment changes. We compare EAT-C with RL/planning targeting similar problems and methods with environment generators or adversarial agents. 
Extensive experiments on diverse tasks demonstrate the advantages of our method on improving RL\u2019s efficiency and generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/ao22a/ao22a.pdf", "supp": "", "pdf_size": 2099739, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10853735307068062859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ao22a.html" }, { "title": "EDEN: Communication-Efficient and Robust Distributed Mean Estimation for Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17679", "id": "17679", "proceeding": "https://proceedings.mlr.press/v162/vargaftik22a.html", "poster": "/media/PosterPDFs/ICML%202022/a381c2c35c9157f6b67fd07d5a200ae1_fEKaMPf.png?t=1657463498.1359527", "slides": "", "author_site": "Shay Vargaftik, Ran Ben Basat, Amit Portnoy, Gal Mendelson, Yaniv Ben Itzhak, Michael Mitzenmacher", "author": "Shay Vargaftik; Ran Ben Basat; Amit Portnoy; Gal Mendelson; Yaniv Ben Itzhak; Michael Mitzenmacher", "abstract": "Distributed Mean Estimation (DME) is a central building block in federated learning, where clients send local gradients to a parameter server for averaging and updating the model. Due to communication constraints, clients often use lossy compression techniques to compress the gradients, resulting in estimation inaccuracies. DME is more challenging when clients have diverse network conditions, such as constrained communication budgets and packet losses. In such settings, DME techniques often incur a significant increase in the estimation error leading to degraded learning performance. In this work, we propose a robust DME technique named EDEN that naturally handles heterogeneous communication budgets and packet losses. We derive appealing theoretical guarantees for EDEN and evaluate it empirically. Our results demonstrate that EDEN consistently improves over state-of-the-art DME techniques.", "bibtex": "@InProceedings{pmlr-v162-vargaftik22a,\n title = \t {{EDEN}: Communication-Efficient and Robust Distributed Mean Estimation for Federated Learning},\n author = {Vargaftik, Shay and Basat, Ran Ben and Portnoy, Amit and Mendelson, Gal and Itzhak, Yaniv Ben and Mitzenmacher, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21984--22014},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vargaftik22a/vargaftik22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vargaftik22a.html},\n abstract = \t {Distributed Mean Estimation (DME) is a central building block in federated learning, where clients send local gradients to a parameter server for averaging and updating the model. Due to communication constraints, clients often use lossy compression techniques to compress the gradients, resulting in estimation inaccuracies. DME is more challenging when clients have diverse network conditions, such as constrained communication budgets and packet losses. In such settings, DME techniques often incur a significant increase in the estimation error leading to degraded learning performance. 
In this work, we propose a robust DME technique named EDEN that naturally handles heterogeneous communication budgets and packet losses. We derive appealing theoretical guarantees for EDEN and evaluate it empirically. Our results demonstrate that EDEN consistently improves over state-of-the-art DME techniques.}\n}", "pdf": "https://proceedings.mlr.press/v162/vargaftik22a/vargaftik22a.pdf", "supp": "", "pdf_size": 1317444, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3209500586717789200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "VMware Research; University College London; Ben-Gurion University; Stanford University; VMware Research; Harvard University", "aff_domain": "; ; ; ; ; ", "email": "; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/vargaftik22a.html", "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "VMware, Inc.;University College London;Ben-Gurion University of the Negev;Stanford University;Harvard University", "aff_unique_dep": "VMware Research;;;;", "aff_unique_url": "https://www.vmware.com/research.html;https://www.ucl.ac.uk;https://www.bgu.ac.il;https://www.stanford.edu;https://www.harvard.edu", "aff_unique_abbr": "VMware;UCL;BGU;Stanford;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;2;0;0;0", "aff_country_unique": "United States;United Kingdom;Israel" }, { "title": "Easy Variational Inference for Categorical Models via an Independent Binary Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16917", "id": "16917", "proceeding": "https://proceedings.mlr.press/v162/wojnowicz22a.html", "poster": "/media/PosterPDFs/ICML%202022/d94fd74dcde1aa553be72c1006578b23.png?t=1658199361.8535106", "slides": "", "author_site": "Michael Wojnowicz, Shuchin Aeron, Eric Miller, Michael Hughes", "author": "Michael T Wojnowicz; Shuchin Aeron; Eric L Miller; Michael Hughes", "abstract": "We pursue tractable Bayesian analysis of generalized linear models (GLMs) for categorical data. GLMs have been difficult to scale to more than a few dozen categories due to non-conjugacy or strong posterior dependencies when using conjugate auxiliary variable methods. We define a new class of GLMs for categorical data called categorical-from-binary (CB) models. Each CB model has a likelihood that is bounded by the product of binary likelihoods, suggesting a natural posterior approximation. This approximation makes inference straightforward and fast; using well-known auxiliary variables for probit or logistic regression, the product of binary models admits conjugate closed-form variational inference that is embarrassingly parallel across categories and invariant to category ordering. Moreover, an independent binary model simultaneously approximates multiple CB models. Bayesian model averaging over these can improve the quality of the approximation for any given dataset. 
We show that our approach scales to thousands of categories, outperforming posterior estimation competitors like Automatic Differentiation Variational Inference (ADVI) and No U-Turn Sampling (NUTS) in the time required to achieve fixed prediction quality.", "bibtex": "@InProceedings{pmlr-v162-wojnowicz22a,\n title = \t {Easy Variational Inference for Categorical Models via an Independent Binary Approximation},\n author = {Wojnowicz, Michael T and Aeron, Shuchin and Miller, Eric L and Hughes, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23857--23896},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wojnowicz22a/wojnowicz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wojnowicz22a.html},\n abstract = \t {We pursue tractable Bayesian analysis of generalized linear models (GLMs) for categorical data. GLMs have been difficult to scale to more than a few dozen categories due to non-conjugacy or strong posterior dependencies when using conjugate auxiliary variable methods. We define a new class of GLMs for categorical data called categorical-from-binary (CB) models. Each CB model has a likelihood that is bounded by the product of binary likelihoods, suggesting a natural posterior approximation. This approximation makes inference straightforward and fast; using well-known auxiliary variables for probit or logistic regression, the product of binary models admits conjugate closed-form variational inference that is embarrassingly parallel across categories and invariant to category ordering. Moreover, an independent binary model simultaneously approximates multiple CB models. Bayesian model averaging over these can improve the quality of the approximation for any given dataset. We show that our approach scales to thousands of categories, outperforming posterior estimation competitors like Automatic Differentiation Variational Inference (ADVI) and No U-Turn Sampling (NUTS) in the time required to achieve fixed prediction quality.}\n}", "pdf": "https://proceedings.mlr.press/v162/wojnowicz22a/wojnowicz22a.pdf", "supp": "", "pdf_size": 1749566, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13180457782658047792&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Tufts University, Medford, MA, USA+Data Intensive Studies Center+Dept. of Electrical and Computer Engineering; Tufts University, Medford, MA, USA+Data Intensive Studies Center+Dept. of Electrical and Computer Engineering; Tufts University, Medford, MA, USA+Data Intensive Studies Center+Dept. of Electrical and Computer Engineering; Tufts University, Medford, MA, USA+Dept. 
of Computer Science", "aff_domain": "tufts.edu; ; ; ", "email": "tufts.edu; ; ; ", "github": "github.com/tufts-ml/categorical-from-binary", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wojnowicz22a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2;0+3", "aff_unique_norm": "Tufts University;Data Intensive Studies Center;University of California, Los Angeles;University Affiliation Not Specified", "aff_unique_dep": ";;Department of Electrical and Computer Engineering;Department of Computer Science", "aff_unique_url": "https://www.tufts.edu;;https://www.ucla.edu;", "aff_unique_abbr": "Tufts;;UCLA;", "aff_campus_unique_index": "0+2;0+2;0+2;0", "aff_campus_unique": "Medford;;Los Angeles", "aff_country_unique_index": "0+0;0+0;0+0;0", "aff_country_unique": "United States;" }, { "title": "Efficient Approximate Inference for Stationary Kernel on Frequency Domain", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18381", "id": "18381", "proceeding": "https://proceedings.mlr.press/v162/jung22b.html", "poster": "/media/PosterPDFs/ICML%202022/3d9dabe52805a1ea21864b09f3397593.png?t=1657783527.558994", "slides": "", "author_site": "Yohan Jung, Kyungwoo Song, Jinkyoo Park", "author": "Yohan Jung; Kyungwoo Song; Jinkyoo Park", "abstract": "Based on the Fourier duality between a stationary kernel and its spectral density, modeling the spectral density using a Gaussian mixture density enables one to construct a flexible kernel, known as a Spectral Mixture kernel, that can model any stationary kernel. However, despite its expressive power, training this kernel is typically difficult because scalability and overfitting issues often arise due to a large number of training parameters. To resolve these issues, we propose an approximate inference method for estimating the Spectral mixture kernel hyperparameters. Specifically, we approximate this kernel by using the finite random spectral points based on Random Fourier Feature and optimize the parameters for the distribution of spectral points by sampling-based variational inference. To improve this inference procedure, we analyze the training loss and propose two special methods: a sampling method of spectral points to reduce the error of the approximate kernel in training, and an approximate natural gradient to accelerate the convergence of parameter inference.", "bibtex": "@InProceedings{pmlr-v162-jung22b,\n title = \t {Efficient Approximate Inference for Stationary Kernel on Frequency Domain},\n author = {Jung, Yohan and Song, Kyungwoo and Park, Jinkyoo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10502--10538},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jung22b/jung22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/jung22b.html},\n abstract = \t {Based on the Fourier duality between a stationary kernel and its spectral density, modeling the spectral density using a Gaussian mixture density enables one to construct a flexible kernel, known as a Spectral Mixture kernel, that can model any stationary kernel. 
However, despite its expressive power, training this kernel is typically difficult because scalability and overfitting issues often arise due to a large number of training parameters. To resolve these issues, we propose an approximate inference method for estimating the Spectral mixture kernel hyperparameters. Specifically, we approximate this kernel by using the finite random spectral points based on Random Fourier Feature and optimize the parameters for the distribution of spectral points by sampling-based variational inference. To improve this inference procedure, we analyze the training loss and propose two special methods: a sampling method of spectral points to reduce the error of the approximate kernel in training, and an approximate natural gradient to accelerate the convergence of parameter inference.}\n}", "pdf": "https://proceedings.mlr.press/v162/jung22b/jung22b.pdf", "supp": "", "pdf_size": 1868330, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9866053584774761232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Industrial & Systems Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea; Department of Artificial Intelligence, University of Seoul, Seoul, South Korea; Department of Industrial & Systems Engineering, Korea Advanced Institute of Science and Technology, Daejeon, South Korea", "aff_domain": "kaist.ac.kr; ; ", "email": "kaist.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jung22b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Seoul", "aff_unique_dep": "Department of Industrial & Systems Engineering;Department of Artificial Intelligence", "aff_unique_url": "https://www.kaist.ac.kr;http://www.seoultech.ac.kr", "aff_unique_abbr": "KAIST;UOS", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Daejeon;Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Efficient Computation of Higher-Order Subgraph Attribution via Message Passing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17545", "id": "17545", "proceeding": "https://proceedings.mlr.press/v162/xiong22a.html", "poster": "/media/PosterPDFs/ICML%202022/58f1e2bfc0c0c182f1afdab7cae02402_IGxSkE7.png?t=1657567995.80358", "slides": "", "author_site": "Ping Xiong, Thomas Schnake, Gr\u00e9goire Montavon, Klaus-robert Mueller, Shinichi Nakajima", "author": "Ping Xiong; Thomas Schnake; Gr\u00e9goire Montavon; Klaus-Robert M\u00fcller; Shinichi Nakajima", "abstract": "Explaining graph neural networks (GNNs) has become more and more important recently. Higher-order interpretation schemes, such as GNN-LRP (layer-wise relevance propagation for GNN), emerged as powerful tools for unraveling how different features interact thereby contributing to explaining GNNs. GNN-LRP gives a relevance attribution of walks between nodes at each layer, and the subgraph attribution is expressed as a sum over exponentially many such walks. In this work, we demonstrate that such exponential complexity can be avoided. In particular, we propose novel algorithms that enable to attribute subgraphs with GNN-LRP in linear-time (w.r.t. the network depth). Our algorithms are derived via message passing techniques that make use of the distributive property, thereby directly computing quantities for higher-order explanations. 
We further adapt our efficient algorithms to compute a generalization of subgraph attributions that also takes into account the neighboring graph features. Experimental results show the significant acceleration of the proposed algorithms and demonstrate the high usefulness and scalability of our novel generalized subgraph attribution method.", "bibtex": "@InProceedings{pmlr-v162-xiong22a,\n title = \t {Efficient Computation of Higher-Order Subgraph Attribution via Message Passing},\n author = {Xiong, Ping and Schnake, Thomas and Montavon, Gr{\\'e}goire and M{\\\"u}ller, Klaus-Robert and Nakajima, Shinichi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24478--24495},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xiong22a/xiong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xiong22a.html},\n abstract = \t {Explaining graph neural networks (GNNs) has become more and more important recently. Higher-order interpretation schemes, such as GNN-LRP (layer-wise relevance propagation for GNN), emerged as powerful tools for unraveling how different features interact thereby contributing to explaining GNNs. GNN-LRP gives a relevance attribution of walks between nodes at each layer, and the subgraph attribution is expressed as a sum over exponentially many such walks. In this work, we demonstrate that such exponential complexity can be avoided. In particular, we propose novel algorithms that enable to attribute subgraphs with GNN-LRP in linear-time (w.r.t. the network depth). Our algorithms are derived via message passing techniques that make use of the distributive property, thereby directly computing quantities for higher-order explanations. We further adapt our efficient algorithms to compute a generalization of subgraph attributions that also takes into account the neighboring graph features. 
Experimental results show the significant acceleration of the proposed algorithms and demonstrate the high usefulness and scalability of our novel generalized subgraph attribution method.}\n}", "pdf": "https://proceedings.mlr.press/v162/xiong22a/xiong22a.pdf", "supp": "", "pdf_size": 948024, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2963189861104141463&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Technische Universit\u00e4t Berlin (TU Berlin); Technische Universit\u00e4t Berlin (TU Berlin) + BIFOLD \u2013 Berlin Institute for the Foundations of Learning and Data; Technische Universit\u00e4t Berlin (TU Berlin) + BIFOLD \u2013 Berlin Institute for the Foundations of Learning and Data; Technische Universit\u00e4t Berlin (TU Berlin) + BIFOLD \u2013 Berlin Institute for the Foundations of Learning and Data + Department of Artificial Intelligence, Korea University, Seoul 136-713, Korea + Max Planck Institut f\u00fcr Informatik, 66123 Saarbr\u00fccken, Germany; Technische Universit\u00e4t Berlin (TU Berlin) + BIFOLD \u2013 Berlin Institute for the Foundations of Learning and Data + RIKEN Center for AIP, Japan", "aff_domain": "tu-berlin.de; ; ; ;tu-berlin.de", "email": "tu-berlin.de; ; ; ;tu-berlin.de", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xiong22a.html", "aff_unique_index": "0;0+1;0+1;0+1+2+3;0+1+4", "aff_unique_norm": "Technische Universit\u00e4t Berlin;Berlin Institute for the Foundations of Learning and Data;Korea University;Max Planck Institute for Informatics;RIKEN Center for AIP", "aff_unique_dep": ";;Department of Artificial Intelligence;Institute for Informatics;", "aff_unique_url": "https://www.tu-berlin.de;https://www.bifold.berlin;http://www.korea.ac.kr;https://mpi-sws.org;https://aipcenter.riken.jp/en/", "aff_unique_abbr": "TU Berlin;BIFOLD;KU;MPII;Riken AIP", "aff_campus_unique_index": "0;0;0;0+2;0", "aff_campus_unique": "Berlin;;Seoul", "aff_country_unique_index": "0;0+0;0+0;0+0+1+0;0+0+2", "aff_country_unique": "Germany;South Korea;Japan" }, { "title": "Efficient Distributionally Robust Bayesian Optimization with Worst-case Sensitivity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17879", "id": "17879", "proceeding": "https://proceedings.mlr.press/v162/tay22a.html", "poster": "/media/PosterPDFs/ICML%202022/42fe880812925e520249e808937738d2_3ruKupu.png?t=1657593261.8232787", "slides": "", "author_site": "Sebastian Tay, Chuan Sheng Foo, Urano Daisuke, Richalynn Leong, Bryan Kian Hsiang Low", "author": "Sebastian Shenghong Tay; Chuan Sheng Foo; Urano Daisuke; Richalynn Leong; Bryan Kian Hsiang Low", "abstract": "In distributionally robust Bayesian optimization (DRBO), an exact computation of the worst-case expected value requires solving an expensive convex optimization problem. We develop a fast approximation of the worst-case expected value based on the notion of worst-case sensitivity that caters to arbitrary convex distribution distances. We provide a regret bound for our novel DRBO algorithm with the fast approximation, and empirically show it is competitive with that using the exact worst-case expected value while incurring significantly less computation time.
In order to guide the choice of distribution distance to be used with DRBO, we show that our approximation implicitly optimizes an objective close to an interpretable risk-sensitive value.", "bibtex": "@InProceedings{pmlr-v162-tay22a,\n title = \t {Efficient Distributionally Robust {B}ayesian Optimization with Worst-case Sensitivity},\n author = {Tay, Sebastian Shenghong and Foo, Chuan Sheng and Daisuke, Urano and Leong, Richalynn and Low, Bryan Kian Hsiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21180--21204},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tay22a/tay22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tay22a.html},\n abstract = \t {In distributionally robust Bayesian optimization (DRBO), an exact computation of the worst-case expected value requires solving an expensive convex optimization problem. We develop a fast approximation of the worst-case expected value based on the notion of worst-case sensitivity that caters to arbitrary convex distribution distances. We provide a regret bound for our novel DRBO algorithm with the fast approximation, and empirically show it is competitive with that using the exact worst-case expected value while incurring significantly less computation time. In order to guide the choice of distribution distance to be used with DRBO, we show that our approximation implicitly optimizes an objective close to an interpretable risk-sensitive value.}\n}", "pdf": "https://proceedings.mlr.press/v162/tay22a/tay22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tay22a-supp.zip", "pdf_size": 3986273, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8043667741079567960&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, National University of Singapore, Singapore+Institute for Infocomm Research, A*STAR, Singapore; Institute for Infocomm Research, A*STAR, Singapore; Temasek Life Sciences Laboratory, Singapore; Temasek Life Sciences Laboratory, Singapore; Department of Computer Science, National University of Singapore, Singapore", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/tay22a.html", "aff_unique_index": "0+1;1;2;2;0", "aff_unique_norm": "National University of Singapore;Institute for Infocomm Research;Temasek Life Sciences Laboratory", "aff_unique_dep": "Department of Computer Science;;Life Sciences", "aff_unique_url": "https://www.nus.edu.sg;https://www.i2r.a-star.edu.sg;", "aff_unique_abbr": "NUS;I2R;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Efficient Learning for AlphaZero via Path Consistency", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18015", "id": "18015", "proceeding": "https://proceedings.mlr.press/v162/zhao22h.html", "poster": "/media/PosterPDFs/ICML%202022/7f018eb7b301a66658931cb8a93fd6e8.png?t=1657434407.1389127", "slides": "", "author_site": "Dengwei Zhao, Shikui Tu, Lei Xu", "author": "Dengwei Zhao; Shikui Tu; Lei Xu", "abstract": "In recent years, 
deep reinforcement learning has made great breakthroughs on board games. Still, most of the works require huge computational resources for a large scale of environmental interactions or self-play for the games. This paper aims at building powerful models under a limited amount of self-plays which can be utilized by a human throughout the lifetime. We propose a learning algorithm built on AlphaZero, with its path searching regularised by a path consistency (PC) optimality, i.e., values on one optimal search path should be identical. Thus, the algorithm is shortly named PCZero. In implementation, the historical trajectory and the search paths scouted by MCTS strike a good balance between exploration and exploitation, which enhances the generalization ability effectively. PCZero obtains a $94.1\\%$ winning rate against the champion of the Hex Computer Olympiad in 2015 on $13\\times 13$ Hex, much higher than the $84.3\\%$ achieved by AlphaZero. The models consume only $900K$ self-play games, about the amount humans can study in a lifetime. The improvements by PCZero have also been generalized to Othello and Gomoku. Experiments also demonstrate the efficiency of PCZero under the offline learning setting.", "bibtex": "@InProceedings{pmlr-v162-zhao22h,\n title = \t {Efficient Learning for {A}lpha{Z}ero via Path Consistency},\n author = {Zhao, Dengwei and Tu, Shikui and Xu, Lei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26971--26981},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22h/zhao22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22h.html},\n abstract = \t {In recent years, deep reinforcement learning has made great breakthroughs on board games. Still, most of the works require huge computational resources for a large scale of environmental interactions or self-play for the games. This paper aims at building powerful models under a limited amount of self-plays which can be utilized by a human throughout the lifetime. We propose a learning algorithm built on AlphaZero, with its path searching regularised by a path consistency (PC) optimality, i.e., values on one optimal search path should be identical. Thus, the algorithm is shortly named PCZero. In implementation, the historical trajectory and the search paths scouted by MCTS strike a good balance between exploration and exploitation, which enhances the generalization ability effectively. PCZero obtains a $94.1\\%$ winning rate against the champion of the Hex Computer Olympiad in 2015 on $13\\times 13$ Hex, much higher than the $84.3\\%$ achieved by AlphaZero. The models consume only $900K$ self-play games, about the amount humans can study in a lifetime. The improvements by PCZero have also been generalized to Othello and Gomoku.
Experiments also demonstrate the efficiency of PCZero under offline learning setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22h/zhao22h.pdf", "supp": "", "pdf_size": 625264, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7398038262532176323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China; Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China; Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn; ", "email": "sjtu.edu.cn;sjtu.edu.cn; ", "github": "https://github.com/CMACH508/PCZero", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22h.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Efficient Learning of CNNs using Patch Based Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17341", "id": "17341", "proceeding": "https://proceedings.mlr.press/v162/brutzkus22a.html", "poster": "/media/PosterPDFs/ICML%202022/9a83eabfb7fa303a2d85dbc6f37483e5.png?t=1657555827.8735175", "slides": "", "author_site": "Alon Brutzkus, Amir Globerson, Eran Malach, Alon Regev Netser, Shai Shalev-Shwartz", "author": "Alon Brutzkus; Amir Globerson; Eran Malach; Alon Regev Netser; Shai Shalev-Schwartz", "abstract": "Recent work has demonstrated the effectiveness of using patch based representations when learning from image data. Here we provide theoretical support for this observation, by showing that a simple semi-supervised algorithm that uses patch statistics can efficiently learn labels produced by a one-hidden-layer Convolutional Neural Network (CNN). Since CNNs are known to be computationally hard to learn in the worst case, our analysis holds under some distributional assumptions. We show that these assumptions are necessary and sufficient for our results to hold. We verify that the distributional assumptions hold on real-world data by experimenting on the CIFAR-10 dataset, and find that the analyzed algorithm outperforms a vanilla one-hidden-layer CNN. 
Finally, we demonstrate that by running the algorithm in a layer-by-layer fashion we can build a deep model which gives further improvements, hinting that this method provides insights about the behavior of deep CNNs.", "bibtex": "@InProceedings{pmlr-v162-brutzkus22a,\n title = \t {Efficient Learning of {CNN}s using Patch Based Features},\n author = {Brutzkus, Alon and Globerson, Amir and Malach, Eran and Netser, Alon Regev and Shalev-Schwartz, Shai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2336--2356},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/brutzkus22a/brutzkus22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/brutzkus22a.html},\n abstract = \t {Recent work has demonstrated the effectiveness of using patch based representations when learning from image data. Here we provide theoretical support for this observation, by showing that a simple semi-supervised algorithm that uses patch statistics can efficiently learn labels produced by a one-hidden-layer Convolutional Neural Network (CNN). Since CNNs are known to be computationally hard to learn in the worst case, our analysis holds under some distributional assumptions. We show that these assumptions are necessary and sufficient for our results to hold. We verify that the distributional assumptions hold on real-world data by experimenting on the CIFAR-10 dataset, and find that the analyzed algorithm outperforms a vanilla one-hidden-layer CNN. Finally, we demonstrate that by running the algorithm in a layer-by-layer fashion we can build a deep model which gives further improvements, hinting that this method provides insights about the behavior of deep CNNs.}\n}", "pdf": "https://proceedings.mlr.press/v162/brutzkus22a/brutzkus22a.pdf", "supp": "", "pdf_size": 2089901, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6420636522525728767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Blavatnik School of Computer Science, Tel Aviv University, Israel; Blavatnik School of Computer Science, Tel Aviv University, Israel; School of Computer Science, The Hebrew University of Jerusalem, Israel; School of Computer Science, The Hebrew University of Jerusalem, Israel; School of Computer Science, The Hebrew University of Jerusalem, Israel", "aff_domain": "tau.ac.il;tau.ac.il;mail.huji.ac.il;mail.huji.ac.il;cs.huji.ac.il", "email": "tau.ac.il;tau.ac.il;mail.huji.ac.il;mail.huji.ac.il;cs.huji.ac.il", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/brutzkus22a.html", "aff_unique_index": "0;0;1;1;1", "aff_unique_norm": "Tel Aviv University;Hebrew University of Jerusalem", "aff_unique_dep": "Blavatnik School of Computer Science;School of Computer Science", "aff_unique_url": "https://www.tau.ac.il;http://www.huji.ac.il", "aff_unique_abbr": "TAU;HUJI", "aff_campus_unique_index": "0;0;1;1;1", "aff_campus_unique": "Tel Aviv;Jerusalem", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Efficient Low Rank Convex Bounds for Pairwise Discrete Graphical Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17753", "id": "17753", "proceeding": 
"https://proceedings.mlr.press/v162/durante22a.html", "poster": "/media/PosterPDFs/ICML%202022/c0356641f421b381e475776b602a5da8.png?t=1657642538.3810523", "slides": "/media/icml-2022/Slides/17753.pdf", "author_site": "Valentin Durante, George Katsirelos, Thomas Schiex", "author": "Valentin Durante; George Katsirelos; Thomas Schiex", "abstract": "In this paper, we extend a Burer-Monteiro style method to compute low rank Semi-Definite Programming (SDP) bounds for the MAP problem on discrete graphical models with an arbitrary number of states and arbitrary pairwise potentials. We consider both a penalized constraint approach and a dedicated Block Coordinate Descent (BCD) approach which avoids large penalty coefficients in the cost matrix. We show our algorithm is decreasing. Experiments show that the BCD approach compares favorably to the penalized approach and to usual linear bounds relying on convergent message passing approaches.", "bibtex": "@InProceedings{pmlr-v162-durante22a,\n title = \t {Efficient Low Rank Convex Bounds for Pairwise Discrete Graphical Models},\n author = {Durante, Valentin and Katsirelos, George and Schiex, Thomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5726--5741},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/durante22a/durante22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/durante22a.html},\n abstract = \t {In this paper, we extend a Burer-Monteiro style method to compute low rank Semi-Definite Programming (SDP) bounds for the MAP problem on discrete graphical models with an arbitrary number of states and arbitrary pairwise potentials. We consider both a penalized constraint approach and a dedicated Block Coordinate Descent (BCD) approach which avoids large penalty coefficients in the cost matrix. We show our algorithm is decreasing. 
Experiments show that the BCD approach compares favorably to the penalized approach and to usual linear bounds relying on convergent message passing approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/durante22a/durante22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/durante22a-supp.zip", "pdf_size": 503852, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16254812317655783736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Universit\u00e9 F\u00e9d\u00e9rale de Toulouse, ANITI, INRAE, UR 875, 31326 Toulouse, France; MIA-Paris-Math\u00e9matiques et Informatique Appliqu\u00e9es, INRAE, 75231 Paris, France; Universit\u00e9 F\u00e9d\u00e9rale de Toulouse, ANITI, INRAE, UR 875, 31326 Toulouse, France", "aff_domain": "inrae.fr; ;inrae.fr", "email": "inrae.fr; ;inrae.fr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/durante22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 F\u00e9d\u00e9rale de Toulouse;INRAE", "aff_unique_dep": "ANITI, INRAE, UR 875;MIA-Paris-Mathematiques et Informatique Appliquees", "aff_unique_url": ";https://www.inrae.fr", "aff_unique_abbr": ";INRAE", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Toulouse;Paris", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Efficient Model-based Multi-agent Reinforcement Learning via Optimistic Equilibrium Computation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17345", "id": "17345", "proceeding": "https://proceedings.mlr.press/v162/sessa22a.html", "poster": "/media/PosterPDFs/ICML%202022/05f17e3cfa5de42020eaa6df34fb4805.png?t=1657659216.93548", "slides": "/media/icml-2022/Slides/17345.pdf", "author_site": "Pier Giuseppe Sessa, Maryam Kamgarpour, Andreas Krause", "author": "Pier Giuseppe Sessa; Maryam Kamgarpour; Andreas Krause", "abstract": "We consider model-based multi-agent reinforcement learning, where the environment transition model is unknown and can only be learned via expensive interactions with the environment. We propose H-MARL (Hallucinated Multi-Agent Reinforcement Learning), a novel sample-efficient algorithm that can efficiently balance exploration, i.e., learning about the environment, and exploitation, i.e., achieve good equilibrium performance in the underlying general-sum Markov game. H-MARL builds high-probability confidence intervals around the unknown transition model and sequentially updates them based on newly observed data. Using these, it constructs an optimistic hallucinated game for the agents for which equilibrium policies are computed at each round. We consider general statistical models (e.g., Gaussian processes, deep ensembles, etc.) and policy classes (e.g., deep neural networks), and theoretically analyze our approach by bounding the agents\u2019 dynamic regret. Moreover, we provide a convergence rate to the equilibria of the underlying Markov game. We demonstrate our approach experimentally on an autonomous driving simulation benchmark. 
H-MARL learns successful equilibrium policies after a few interactions with the environment and can significantly improve the performance compared to non-optimistic exploration methods.", "bibtex": "@InProceedings{pmlr-v162-sessa22a,\n title = \t {Efficient Model-based Multi-agent Reinforcement Learning via Optimistic Equilibrium Computation},\n author = {Sessa, Pier Giuseppe and Kamgarpour, Maryam and Krause, Andreas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19580--19597},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sessa22a/sessa22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sessa22a.html},\n abstract = \t {We consider model-based multi-agent reinforcement learning, where the environment transition model is unknown and can only be learned via expensive interactions with the environment. We propose H-MARL (Hallucinated Multi-Agent Reinforcement Learning), a novel sample-efficient algorithm that can efficiently balance exploration, i.e., learning about the environment, and exploitation, i.e., achieve good equilibrium performance in the underlying general-sum Markov game. H-MARL builds high-probability confidence intervals around the unknown transition model and sequentially updates them based on newly observed data. Using these, it constructs an optimistic hallucinated game for the agents for which equilibrium policies are computed at each round. We consider general statistical models (e.g., Gaussian processes, deep ensembles, etc.) and policy classes (e.g., deep neural networks), and theoretically analyze our approach by bounding the agents\u2019 dynamic regret. Moreover, we provide a convergence rate to the equilibria of the underlying Markov game. We demonstrate our approach experimentally on an autonomous driving simulation benchmark. 
H-MARL learns successful equilibrium policies after a few interactions with the environment and can significantly improve the performance compared to non-optimistic exploration methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/sessa22a/sessa22a.pdf", "supp": "", "pdf_size": 917567, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3902738644597941839&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "ETH Z\u00fcrich; EPFL Lausanne; ETH Z\u00fcrich", "aff_domain": "ethz.ch; ; ", "email": "ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sessa22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.epfl.ch", "aff_unique_abbr": "ETHZ;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Efficient Online ML API Selection for Multi-Label Classification Tasks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16687", "id": "16687", "proceeding": "https://proceedings.mlr.press/v162/chen22ad.html", "poster": "/media/PosterPDFs/ICML%202022/799de6d3dae4c924142cf245a1d7f703.png?t=1657741054.5436072", "slides": "", "author_site": "Lingjiao Chen, Matei Zaharia, James Zou", "author": "Lingjiao Chen; Matei Zaharia; James Zou", "abstract": "Multi-label classification tasks such as OCR and multi-object recognition are a major focus of the growing machine learning as a service industry. While many multi-label APIs are available, it is challenging for users to decide which API to use for their own data and budget, due to the heterogeneity in their prices and performance. Recent work has shown how to efficiently select and combine single label APIs to optimize performance and cost. However, its computation cost is exponential in the number of labels, and is not suitable for settings like OCR. In this work, we propose FrugalMCT, a principled framework that adaptively selects the APIs to use for different data in an online fashion while respecting the user\u2019s budget. It allows combining ML APIs\u2019 predictions for any single data point, and selects the best combination based on an accuracy estimator. We run systematic experiments using ML APIs from Google, Microsoft, Amazon, IBM, Tencent, and other providers for tasks including multi-label image classification, scene text recognition, and named entity recognition. 
Across these tasks, FrugalMCT can achieve over 90% cost reduction while matching the accuracy of the best single API, or up to 8% better accuracy while matching the best API\u2019s cost.", "bibtex": "@InProceedings{pmlr-v162-chen22ad,\n title = \t {Efficient Online {ML} {API} Selection for Multi-Label Classification Tasks},\n author = {Chen, Lingjiao and Zaharia, Matei and Zou, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3716--3746},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ad/chen22ad.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ad.html},\n abstract = \t {Multi-label classification tasks such as OCR and multi-object recognition are a major focus of the growing machine learning as a service industry. While many multi-label APIs are available, it is challenging for users to decide which API to use for their own data and budget, due to the heterogeneity in their prices and performance. Recent work has shown how to efficiently select and combine single label APIs to optimize performance and cost. However, its computation cost is exponential in the number of labels, and is not suitable for settings like OCR. In this work, we propose FrugalMCT, a principled framework that adaptively selects the APIs to use for different data in an online fashion while respecting the user\u2019s budget. It allows combining ML APIs\u2019 predictions for any single data point, and selects the best combination based on an accuracy estimator. We run systematic experiments using ML APIs from Google, Microsoft, Amazon, IBM, Tencent, and other providers for tasks including multi-label image classification, scene text recognition, and named entity recognition. 
Across these tasks, FrugalMCT can achieve over 90% cost reduction while matching the accuracy of the best single API, or up to 8% better accuracy while matching the best API\u2019s cost.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ad/chen22ad.pdf", "supp": "", "pdf_size": 2997548, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11325253642198235814&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Sciences, Stanford University; Department of Computer Sciences, Stanford University; Department of Computer Sciences, Stanford University + Department of Biomedical Data Science, Stanford University", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22ad.html", "aff_unique_index": "0;0;0+0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "United States" }, { "title": "Efficient PAC Learning from the Crowd with Pairwise Comparisons", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16115", "id": "16115", "proceeding": "https://proceedings.mlr.press/v162/zeng22b.html", "poster": "/media/PosterPDFs/ICML%202022/68ce199ec2c5517597ce0a4d89620f55_pfQGRRR.png?t=1658759652.6999946", "slides": "", "author_site": "Shiwei Zeng, Jie Shen", "author": "Shiwei Zeng; Jie Shen", "abstract": "We study crowdsourced PAC learning of threshold functions, where the labels are gathered from a pool of annotators, some of whom may behave adversarially. This remains a challenging problem, and a computationally and query efficient PAC learning algorithm was only recently established by Awasthi et al. (2017). In this paper, we show that by leveraging the more easily acquired pairwise comparison queries, it is possible to exponentially reduce the label complexity while retaining the overall query complexity and runtime. Our main algorithmic contributions are a comparison-equipped labeling scheme that can faithfully recover the true labels of a small set of instances, and a label-efficient filtering process that in conjunction with the small labeled set can reliably infer the true labels of a large instance set.", "bibtex": "@InProceedings{pmlr-v162-zeng22b,\n title = \t {Efficient {PAC} Learning from the Crowd with Pairwise Comparisons},\n author = {Zeng, Shiwei and Shen, Jie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25973--25993},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zeng22b/zeng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zeng22b.html},\n abstract = \t {We study crowdsourced PAC learning of threshold functions, where the labels are gathered from a pool of annotators, some of whom may behave adversarially. This remains a challenging problem, and a computationally and query efficient PAC learning algorithm was only recently established by Awasthi et al. (2017). 
In this paper, we show that by leveraging the more easily acquired pairwise comparison queries, it is possible to exponentially reduce the label complexity while retaining the overall query complexity and runtime. Our main algorithmic contributions are a comparison-equipped labeling scheme that can faithfully recover the true labels of a small set of instances, and a label-efficient filtering process that in conjunction with the small labeled set can reliably infer the true labels of a large instance set.}\n}", "pdf": "https://proceedings.mlr.press/v162/zeng22b/zeng22b.pdf", "supp": "", "pdf_size": 415191, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8560723080915708530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA; Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA", "aff_domain": "stevens.edu;stevens.edu", "email": "stevens.edu;stevens.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zeng22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stevens Institute of Technology", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hoboken", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Efficient Reinforcement Learning in Block MDPs: A Model-free Representation Learning approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16623", "id": "16623", "proceeding": "https://proceedings.mlr.press/v162/zhang22aa.html", "poster": "", "slides": "", "author_site": "Xuezhou Zhang, Yuda Song, Masatoshi Uehara, Mengdi Wang, Alekh Agarwal, Wen Sun", "author": "Xuezhou Zhang; Yuda Song; Masatoshi Uehara; Mengdi Wang; Alekh Agarwal; Wen Sun", "abstract": "We present BRIEE, an algorithm for efficient reinforcement learning in Markov Decision Processes with block-structured dynamics (i.e., Block MDPs), where rich observations are generated from a set of unknown latent states. BRIEE interleaves latent states discovery, exploration, and exploitation together, and can provably learn a near-optimal policy with sample complexity scaling polynomially in the number of latent states, actions, and the time horizon, with no dependence on the size of the potentially infinite observation space. 
Empirically, we show that BRIEE is more sample efficient than the state-of-the-art Block MDP algorithm HOMER and other empirical RL baselines on challenging rich-observation combination lock problems which require deep exploration.", "bibtex": "@InProceedings{pmlr-v162-zhang22aa,\n title = \t {Efficient Reinforcement Learning in Block {MDP}s: A Model-free Representation Learning approach},\n author = {Zhang, Xuezhou and Song, Yuda and Uehara, Masatoshi and Wang, Mengdi and Agarwal, Alekh and Sun, Wen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26517--26547},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22aa/zhang22aa.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22aa.html},\n abstract = \t {We present BRIEE, an algorithm for efficient reinforcement learning in Markov Decision Processes with block-structured dynamics (i.e., Block MDPs), where rich observations are generated from a set of unknown latent states. BRIEE interleaves latent states discovery, exploration, and exploitation together, and can provably learn a near-optimal policy with sample complexity scaling polynomially in the number of latent states, actions, and the time horizon, with no dependence on the size of the potentially infinite observation space. Empirically, we show that BRIEE is more sample efficient than the state-of-the-art Block MDP algorithm HOMER and other empirical RL baselines on challenging rich-observation combination lock problems which require deep exploration.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22aa/zhang22aa.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhang22aa-supp.zip", "pdf_size": 927146, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10850889224658556483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Princeton University; Carnegie Mellon University; Cornell University; Princeton University; Google Research; Cornell University", "aff_domain": "gmail.com; ; ; ; ;cornell.edu", "email": "gmail.com; ; ; ; ;cornell.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22aa.html", "aff_unique_index": "0;1;2;0;3;2", "aff_unique_norm": "Princeton University;Carnegie Mellon University;Cornell University;Google", "aff_unique_dep": ";;;Google Research", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu;https://www.cornell.edu;https://research.google", "aff_unique_abbr": "Princeton;CMU;Cornell;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient Representation Learning via Adaptive Context Pooling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17009", "id": "17009", "proceeding": "https://proceedings.mlr.press/v162/huang22j.html", "poster": "/media/PosterPDFs/ICML%202022/12780ea688a71dabc284b064add459a4.png?t=1656975124.0616477", "slides": "/media/icml-2022/Slides/17009.pdf", "author_site": "Chen Huang, Walter Talbott, Navdeep Jaitly, Joshua M Susskind", "author": "Chen Huang; Walter Talbott; Navdeep Jaitly; Joshua M Susskind", "abstract": 
"Self-attention mechanisms model long-range context by using pairwise attention between all input tokens. In doing so, they assume a fixed attention granularity defined by the individual tokens (e.g., text characters or image pixels), which may not be optimal for modeling complex dependencies at higher levels. In this paper, we propose ContextPool to address this problem by adapting the attention granularity for each token. Inspired by the success of ConvNets that are combined with pooling to capture long-range dependencies, we learn to pool neighboring features for each token before computing attention in a given attention layer. The pooling weights and support size are adaptively determined, allowing the pooled features to encode meaningful context with varying scale. We show that ContextPool makes attention models more expressive, achieving strong performance often with fewer layers and thus significantly reduced cost. Experiments validate that our ContextPool module, when plugged into transformer models, matches or surpasses state-of-the-art performance using less compute on several language and image benchmarks, outperforms recent works with learned context sizes or sparse attention patterns, and is also applicable to ConvNets for efficient feature learning.", "bibtex": "@InProceedings{pmlr-v162-huang22j,\n title = \t {Efficient Representation Learning via Adaptive Context Pooling},\n author = {Huang, Chen and Talbott, Walter and Jaitly, Navdeep and Susskind, Joshua M},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9346--9355},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22j/huang22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22j.html},\n abstract = \t {Self-attention mechanisms model long-range context by using pairwise attention between all input tokens. In doing so, they assume a fixed attention granularity defined by the individual tokens (e.g., text characters or image pixels), which may not be optimal for modeling complex dependencies at higher levels. In this paper, we propose ContextPool to address this problem by adapting the attention granularity for each token. Inspired by the success of ConvNets that are combined with pooling to capture long-range dependencies, we learn to pool neighboring features for each token before computing attention in a given attention layer. The pooling weights and support size are adaptively determined, allowing the pooled features to encode meaningful context with varying scale. We show that ContextPool makes attention models more expressive, achieving strong performance often with fewer layers and thus significantly reduced cost. 
Experiments validate that our ContextPool module, when plugged into transformer models, matches or surpasses state-of-the-art performance using less compute on several language and image benchmarks, outperforms recent works with learned context sizes or sparse attention patterns, and is also applicable to ConvNets for efficient feature learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22j/huang22j.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/huang22j-supp.zip", "pdf_size": 3671016, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9564819237029774355&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States", "aff_domain": "apple.com; ; ; ", "email": "apple.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/huang22j.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cupertino", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient Test-Time Model Adaptation without Forgetting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18323", "id": "18323", "proceeding": "https://proceedings.mlr.press/v162/niu22a.html", "poster": "/media/PosterPDFs/ICML%202022/f33ba15effa5c10e873bf3842afb46a6_PCv9iDM.png?t=1657275767.581412", "slides": "/media/icml-2022/Slides/18323.pdf", "author_site": "Shuaicheng Niu, Jiaxiang Wu, Yifan Zhang, Yaofo Chen, Shijian Zheng, Peilin Zhao, Mingkui Tan", "author": "Shuaicheng Niu; Jiaxiang Wu; Yifan Zhang; Yaofo Chen; Shijian Zheng; Peilin Zhao; Mingkui Tan", "abstract": "Test-time adaptation provides an effective means of tackling the potential distribution shift between model training and inference, by dynamically updating the model at test time. This area has seen fast progress recently, at the effectiveness of handling test shifts. Nonetheless, prior methods still suffer two key limitations: 1) these methods rely on performing backward computation for each test sample, which takes a considerable amount of time; and 2) these methods focus on improving the performance on out-of-distribution test samples and ignore that the adaptation on test data may result in a catastrophic forgetting issue, \\ie, the performance on in-distribution test samples may degrade. To address these issues, we propose an efficient anti-forgetting test-time adaptation (EATA) method. Specifically, we devise a sample-efficient entropy minimization loss to exclude uninformative samples out of backward computation, which improves the overall efficiency and meanwhile boosts the out-of-distribution accuracy. Afterward, we introduce a regularization loss to ensure that critical model weights tend to be preserved during adaptation, thereby alleviating the forgetting issue. 
Extensive experiments on CIFAR-10-C, ImageNet-C, and ImageNet-R verify the effectiveness and superiority of our EATA.", "bibtex": "@InProceedings{pmlr-v162-niu22a,\n title = \t {Efficient Test-Time Model Adaptation without Forgetting},\n author = {Niu, Shuaicheng and Wu, Jiaxiang and Zhang, Yifan and Chen, Yaofo and Zheng, Shijian and Zhao, Peilin and Tan, Mingkui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16888--16905},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/niu22a/niu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/niu22a.html},\n abstract = \t {Test-time adaptation provides an effective means of tackling the potential distribution shift between model training and inference, by dynamically updating the model at test time. This area has seen fast progress recently, at the effectiveness of handling test shifts. Nonetheless, prior methods still suffer two key limitations: 1) these methods rely on performing backward computation for each test sample, which takes a considerable amount of time; and 2) these methods focus on improving the performance on out-of-distribution test samples and ignore that the adaptation on test data may result in a catastrophic forgetting issue, \\ie, the performance on in-distribution test samples may degrade. To address these issues, we propose an efficient anti-forgetting test-time adaptation (EATA) method. Specifically, we devise a sample-efficient entropy minimization loss to exclude uninformative samples out of backward computation, which improves the overall efficiency and meanwhile boosts the out-of-distribution accuracy. Afterward, we introduce a regularization loss to ensure that critical model weights tend to be preserved during adaptation, thereby alleviating the forgetting issue. 
Extensive experiments on CIFAR-10-C, ImageNet-C, and ImageNet-R verify the effectiveness and superiority of our EATA.}\n}", "pdf": "https://proceedings.mlr.press/v162/niu22a/niu22a.pdf", "supp": "", "pdf_size": 818477, "gs_citation": 411, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17499416478096807711&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Software Engineering, South China University of Technology, China+Pazhou Laboratory, China; Tencent AI Lab, China; National University of Singapore, Singapore; School of Software Engineering, South China University of Technology, China; School of Software Engineering, South China University of Technology, China+Key Laboratory of Big Data and Intelligent Robot, Ministry of Education, China; Tencent AI Lab, China; School of Software Engineering, South China University of Technology, China+Key Laboratory of Big Data and Intelligent Robot, Ministry of Education, China", "aff_domain": "scut.edu.cn; ; ; ; ; ;scut.edu.cn", "email": "scut.edu.cn; ; ; ; ; ;scut.edu.cn", "github": "https://github.com/mr-eggplant/EATA", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/niu22a.html", "aff_unique_index": "0+1;2;3;0;0+4;2;0+4", "aff_unique_norm": "South China University of Technology;Pazhou Laboratory;Tencent;National University of Singapore;Key Laboratory of Big Data and Intelligent Robot", "aff_unique_dep": "School of Software Engineering;;Tencent AI Lab;;Ministry of Education", "aff_unique_url": "https://www.scut.edu.cn;;https://ai.tencent.com;https://www.nus.edu.sg;", "aff_unique_abbr": "SCUT;;Tencent AI Lab;NUS;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;1;0;0+0;0;0+0", "aff_country_unique": "China;Singapore" }, { "title": "Efficient Variance Reduction for Meta-learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18047", "id": "18047", "proceeding": "https://proceedings.mlr.press/v162/yang22g.html", "poster": "/media/PosterPDFs/ICML%202022/3202111cf90e7c816a472aaceb72b0df.png?t=1658146669.7616842", "slides": "", "author_site": "Hansi Yang, James Kwok", "author": "Hansi Yang; James Kwok", "abstract": "Meta-learning tries to learn meta-knowledge from a large number of tasks. However, the stochastic meta-gradient can have large variance due to data sampling (from each task) and task sampling (from the whole task distribution), leading to slow convergence. In this paper, we propose a novel approach that integrates variance reduction with first-order meta-learning algorithms such as Reptile. It retains the bilevel formulation which better captures the structure of meta-learning, but does not require storing the vast number of task-specific parameters in general bilevel variance reduction methods. Theoretical results show that it has fast convergence rate due to variance reduction. 
Experiments on benchmark few-shot classification data sets demonstrate its effectiveness over state-of-the-art meta-learning algorithms with and without variance reduction.", "bibtex": "@InProceedings{pmlr-v162-yang22g,\n title = \t {Efficient Variance Reduction for Meta-learning},\n author = {Yang, Hansi and Kwok, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25070--25095},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22g/yang22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22g.html},\n abstract = \t {Meta-learning tries to learn meta-knowledge from a large number of tasks. However, the stochastic meta-gradient can have large variance due to data sampling (from each task) and task sampling (from the whole task distribution), leading to slow convergence. In this paper, we propose a novel approach that integrates variance reduction with first-order meta-learning algorithms such as Reptile. It retains the bilevel formulation which better captures the structure of meta-learning, but does not require storing the vast number of task-specific parameters in general bilevel variance reduction methods. Theoretical results show that it has fast convergence rate due to variance reduction. Experiments on benchmark few-shot classification data sets demonstrate its effectiveness over state-of-the-art meta-learning algorithms with and without variance reduction.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22g/yang22g.pdf", "supp": "", "pdf_size": 1000723, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17270097129242070255&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Clear Water Bay, Hong Kong; Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, Clear Water Bay, Hong Kong", "aff_domain": "cse.ust.hk;cse.ust.hk", "email": "cse.ust.hk;cse.ust.hk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yang22g.html", "aff_unique_index": "0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Efficiently Learning the Topology and Behavior of a Networked Dynamical System Via Active Queries", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17583", "id": "17583", "proceeding": "https://proceedings.mlr.press/v162/rosenkrantz22a.html", "poster": "/media/PosterPDFs/ICML%202022/606c90a06173d69682feb83037a68fec.png?t=1657923987.3807309", "slides": "", "author_site": "Daniel Rosenkrantz, Abhijin Adiga, Madhav Marathe, Zirou Qiu, S. S. 
Ravi, Richard Stearns, Anil Vullikanti", "author": "Daniel J Rosenkrantz; Abhijin Adiga; Madhav Marathe; Zirou Qiu; S S Ravi; Richard Stearns; Anil Vullikanti", "abstract": "Using a discrete dynamical system model, many papers have addressed the problem of learning the behavior (i.e., the local function at each node) of a networked system through active queries, assuming that the network topology is known. We address the problem of inferring both the topology of the network and the behavior of a discrete dynamical system through active queries. We consider two query models studied in the literature, namely the batch model (where all the queries must be submitted together) and the adaptive model (where responses to previous queries can be used in formulating a new query). Our results are for systems where the state of each node is from {0,1} and the local functions are Boolean. We present algorithms to learn the topology and the behavior under both batch and adaptive query models for several classes of dynamical systems. These algorithms use only a polynomial number of queries. We also present experimental results obtained by running our query generation algorithms on synthetic and real-world networks.", "bibtex": "@InProceedings{pmlr-v162-rosenkrantz22a,\n title = \t {Efficiently Learning the Topology and Behavior of a Networked Dynamical System Via Active Queries},\n author = {Rosenkrantz, Daniel J and Adiga, Abhijin and Marathe, Madhav and Qiu, Zirou and Ravi, S S and Stearns, Richard and Vullikanti, Anil},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18796--18808},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rosenkrantz22a/rosenkrantz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rosenkrantz22a.html},\n abstract = \t {Using a discrete dynamical system model, many papers have addressed the problem of learning the behavior (i.e., the local function at each node) of a networked system through active queries, assuming that the network topology is known. We address the problem of inferring both the topology of the network and the behavior of a discrete dynamical system through active queries. We consider two query models studied in the literature, namely the batch model (where all the queries must be submitted together) and the adaptive model (where responses to previous queries can be used in formulating a new query). Our results are for systems where the state of each node is from {0,1} and the local functions are Boolean. We present algorithms to learn the topology and the behavior under both batch and adaptive query models for several classes of dynamical systems. These algorithms use only a polynomial number of queries. 
We also present experimental results obtained by running our query generation algorithms on synthetic and real-world networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/rosenkrantz22a/rosenkrantz22a.pdf", "supp": "", "pdf_size": 657976, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6934186206024538466&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA and Department of Computer Science, University at Albany \u2013 SUNY, Albany, NY, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative and Department of Computer Science, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative and Department of Computer Science, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA and Department of Computer Science, University at Albany \u2013 SUNY, Albany, NY, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA and Department of Computer Science, University at Albany \u2013 SUNY, Albany, NY, USA; Biocomplexity Institute and Initiative and Department of Computer Science, University of Virginia, Charlottesville, VA, USA", "aff_domain": "gmail.com;virginia.edu; ; ; ; ; ", "email": "gmail.com;virginia.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/rosenkrantz22a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "Biocomplexity Institute and Initiative", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Charlottesville", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "End-to-End Balancing for Causal Continuous Treatment-Effect Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16557", "id": "16557", "proceeding": "https://proceedings.mlr.press/v162/bahadori22a.html", "poster": "/media/PosterPDFs/ICML%202022/9379c23ac12dc94053207373040bc791_aZEjM24.png?t=1657386064.2055924", "slides": "", "author_site": "Mohammad Taha Bahadori, Eric Tchetgen Tchetgen, David Heckerman", "author": "Taha Bahadori; Eric Tchetgen Tchetgen; David Heckerman", "abstract": "We study the problem of observational causal inference with continuous treatment. We focus on the challenge of estimating the causal response curve for infrequently-observed treatment values. We design a new algorithm based on the framework of entropy balancing which learns weights that directly maximize causal inference accuracy using end-to-end optimization. Our weights can be customized for different datasets and causal inference algorithms. We propose a new theory for consistency of entropy balancing for continuous treatments. 
Using synthetic and real-world data, we show that our proposed algorithm outperforms the entropy balancing in terms of causal inference accuracy.", "bibtex": "@InProceedings{pmlr-v162-bahadori22a,\n title = \t {End-to-End Balancing for Causal Continuous Treatment-Effect Estimation},\n author = {Bahadori, Taha and Tchetgen, Eric Tchetgen and Heckerman, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1313--1326},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bahadori22a/bahadori22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bahadori22a.html},\n abstract = \t {We study the problem of observational causal inference with continuous treatment. We focus on the challenge of estimating the causal response curve for infrequently-observed treatment values. We design a new algorithm based on the framework of entropy balancing which learns weights that directly maximize causal inference accuracy using end-to-end optimization. Our weights can be customized for different datasets and causal inference algorithms. We propose a new theory for consistency of entropy balancing for continuous treatments. Using synthetic and real-world data, we show that our proposed algorithm outperforms the entropy balancing in terms of causal inference accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/bahadori22a/bahadori22a.pdf", "supp": "", "pdf_size": 712233, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2407767436720973513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Amazon.com, Inc.; Amazon.com, Inc. + Wharton School of the University of Pennsylvania; Amazon.com, Inc.", "aff_domain": "amazon.com; ; ", "email": "amazon.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bahadori22a.html", "aff_unique_index": "0;0+1;0", "aff_unique_norm": "Amazon;University of Pennsylvania", "aff_unique_dep": "Amazon.com, Inc.;Wharton School", "aff_unique_url": "https://www.amazon.com;https://www.wharton.upenn.edu", "aff_unique_abbr": "Amazon;Wharton", "aff_campus_unique_index": "1", "aff_campus_unique": ";Philadelphia", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "United States" }, { "title": "Entropic Causal Inference: Graph Identifiability", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16753", "id": "16753", "proceeding": "https://proceedings.mlr.press/v162/compton22a.html", "poster": "/media/PosterPDFs/ICML%202022/54843973f9b6670141731bbb71a02946.png?t=1658129847.0220938", "slides": "", "author_site": "Spencer Compton, Kristjan Greenewald, Dmitriy Katz, Murat Kocaoglu", "author": "Spencer Compton; Kristjan Greenewald; Dmitriy A Katz; Murat Kocaoglu", "abstract": "Entropic causal inference is a recent framework for learning the causal graph between two variables from observational data by finding the information-theoretically simplest structural explanation of the data, i.e., the model with smallest entropy. In our work, we first extend the causal graph identifiability result in the two-variable setting under relaxed assumptions. 
We then show the first identifiability result using the entropic approach for learning causal graphs with more than two nodes. Our approach utilizes the property that ancestrality between a source node and its descendants can be determined using the bivariate entropic tests. We provide a sound sequential peeling algorithm for general graphs that relies on this property. We also propose a heuristic algorithm for small graphs that shows strong empirical performance. We rigorously evaluate the performance of our algorithms on synthetic data generated from a variety of models, observing improvement over prior work. Finally we test our algorithms on real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-compton22a,\n title = \t {Entropic Causal Inference: Graph Identifiability},\n author = {Compton, Spencer and Greenewald, Kristjan and Katz, Dmitriy A and Kocaoglu, Murat},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4311--4343},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/compton22a/compton22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/compton22a.html},\n abstract = \t {Entropic causal inference is a recent framework for learning the causal graph between two variables from observational data by finding the information-theoretically simplest structural explanation of the data, i.e., the model with smallest entropy. In our work, we first extend the causal graph identifiability result in the two-variable setting under relaxed assumptions. We then show the first identifiability result using the entropic approach for learning causal graphs with more than two nodes. Our approach utilizes the property that ancestrality between a source node and its descendants can be determined using the bivariate entropic tests. We provide a sound sequential peeling algorithm for general graphs that relies on this property. We also propose a heuristic algorithm for small graphs that shows strong empirical performance. We rigorously evaluate the performance of our algorithms on synthetic data generated from a variety of models, observing improvement over prior work. 
Finally we test our algorithms on real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/compton22a/compton22a.pdf", "supp": "", "pdf_size": 1040988, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1793492775333106799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Massachusetts Institute of Technology, Cambridge, USA+MIT-IBM Watson AI Lab, Cambridge, USA; MIT-IBM Watson AI Lab, Cambridge, USA+IBM Research, Cambridge, USA; MIT-IBM Watson AI Lab, Cambridge, USA+IBM Research, Cambridge, USA; Purdue University, West Lafayette, USA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/compton22a.html", "aff_unique_index": "0+1;1+1;1+1;2", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;Purdue University", "aff_unique_dep": ";AI Lab;", "aff_unique_url": "https://web.mit.edu;https://www.ibmwatsonai.org/;https://www.purdue.edu", "aff_unique_abbr": "MIT;MIT-IBM AI Lab;Purdue", "aff_campus_unique_index": "0+0;0+0;0+0;1", "aff_campus_unique": "Cambridge;West Lafayette", "aff_country_unique_index": "0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Entropic Gromov-Wasserstein between Gaussian Distributions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16669", "id": "16669", "proceeding": "https://proceedings.mlr.press/v162/le22a.html", "poster": "/media/PosterPDFs/ICML%202022/e1cd50f4a9880333cdad4a24ee9d550c.png?t=1657911833.033753", "slides": "", "author_site": "Khang Le, Dung Le, Huy Nguyen, , Tung Pham, Nhat Ho", "author": "Khang Le; Dung Q Le; Huy Nguyen; Dat Do; Tung Pham; Nhat Ho", "abstract": "We study the entropic Gromov-Wasserstein and its unbalanced version between (unbalanced) Gaussian distributions with different dimensions. When the metric is the inner product, which we refer to as inner product Gromov-Wasserstein (IGW), we demonstrate that the optimal transportation plans of entropic IGW and its unbalanced variant are (unbalanced) Gaussian distributions. Via an application of von Neumann\u2019s trace inequality, we obtain closed-form expressions for the entropic IGW between these Gaussian distributions. Finally, we consider an entropic inner product Gromov-Wasserstein barycenter of multiple Gaussian distributions. We prove that the barycenter is a Gaussian distribution when the entropic regularization parameter is small. We further derive a closed-form expression for the covariance matrix of the barycenter.", "bibtex": "@InProceedings{pmlr-v162-le22a,\n title = \t {Entropic Gromov-{W}asserstein between {G}aussian Distributions},\n author = {Le, Khang and Le, Dung Q and Nguyen, Huy and Do, Dat and Pham, Tung and Ho, Nhat},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12164--12203},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/le22a/le22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/le22a.html},\n abstract = \t {We study the entropic Gromov-Wasserstein and its unbalanced version between (unbalanced) Gaussian distributions with different dimensions. 
When the metric is the inner product, which we refer to as inner product Gromov-Wasserstein (IGW), we demonstrate that the optimal transportation plans of entropic IGW and its unbalanced variant are (unbalanced) Gaussian distributions. Via an application of von Neumann\u2019s trace inequality, we obtain closed-form expressions for the entropic IGW between these Gaussian distributions. Finally, we consider an entropic inner product Gromov-Wasserstein barycenter of multiple Gaussian distributions. We prove that the barycenter is a Gaussian distribution when the entropic regularization parameter is small. We further derive a closed-form expression for the covariance matrix of the barycenter.}\n}", "pdf": "https://proceedings.mlr.press/v162/le22a/le22a.pdf", "supp": "", "pdf_size": 731224, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3196789223810909860&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Texas at Austin; \u00c9cole Polytechnique; VinAI Research; University of Michigan, Ann Arbor; VinAI Research; University of Texas at Austin", "aff_domain": "utexas.edu;vinai.io; ; ; ; ", "email": "utexas.edu;vinai.io; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/le22a.html", "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "University of Texas at Austin;Ecole Polytechnique;VinAI Research;University of Michigan", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.utexas.edu;https://www.polytechnique.edu;https://www.vinai.io/;https://www.umich.edu", "aff_unique_abbr": "UT Austin;X;VinAI;UM", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Austin;;Ann Arbor", "aff_country_unique_index": "0;1;2;0;2;0", "aff_country_unique": "United States;France;Vietnam" }, { "title": "EqR: Equivariant Representations for Data-Efficient Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16835", "id": "16835", "proceeding": "https://proceedings.mlr.press/v162/mondal22a.html", "poster": "/media/PosterPDFs/ICML%202022/ed77eab0b8ff85d0a6a8365df1846978_TZAUBbV.png?t=1657767003.1582673", "slides": "", "author_site": "Arnab Kumar Mondal, Vineet Jain, Kaleem Siddiqi, Siamak Ravanbakhsh", "author": "Arnab Kumar Mondal; Vineet Jain; Kaleem Siddiqi; Siamak Ravanbakhsh", "abstract": "We study a variety of notions of equivariance as an inductive bias in Reinforcement Learning (RL). In particular, we propose new mechanisms for learning representations that are equivariant to both the agent\u2019s action, as well as symmetry transformations of the state-action pairs. Whereas prior work on exploiting symmetries in deep RL can only incorporate predefined linear transformations, our approach allows non-linear symmetry transformations of state-action pairs to be learned from the data. This is achieved through 1) equivariant Lie algebraic parameterization of state and action encodings, 2) equivariant latent transition models, and 3) the incorporation of symmetry-based losses. 
We demonstrate the advantages of our method, which we call Equivariant representations for RL (EqR), for Atari games in a data-efficient setting limited to 100K steps of interactions with the environment.", "bibtex": "@InProceedings{pmlr-v162-mondal22a,\n title = \t {{E}q{R}: Equivariant Representations for Data-Efficient Reinforcement Learning},\n author = {Mondal, Arnab Kumar and Jain, Vineet and Siddiqi, Kaleem and Ravanbakhsh, Siamak},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15908--15926},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mondal22a/mondal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mondal22a.html},\n abstract = \t {We study a variety of notions of equivariance as an inductive bias in Reinforcement Learning (RL). In particular, we propose new mechanisms for learning representations that are equivariant to both the agent\u2019s action, as well as symmetry transformations of the state-action pairs. Whereas prior work on exploiting symmetries in deep RL can only incorporate predefined linear transformations, our approach allows non-linear symmetry transformations of state-action pairs to be learned from the data. This is achieved through 1) equivariant Lie algebraic parameterization of state and action encodings, 2) equivariant latent transition models, and 3) the incorporation of symmetry-based losses. We demonstrate the advantages of our method, which we call Equivariant representations for RL (EqR), for Atari games in a data-efficient setting limited to 100K steps of interactions with the environment.}\n}", "pdf": "https://proceedings.mlr.press/v162/mondal22a/mondal22a.pdf", "supp": "", "pdf_size": 2775988, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15795267808339174229&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Computer Science, McGill University, Montr\u00e9al, Canada+Mila - Quebec Artificial Intelligence Institute, Montr\u00e9al, Canada+Centre for Intelligent Machines, McGill University, Montr\u00e9al, Canada; School of Computer Science, McGill University, Montr\u00e9al, Canada+Mila - Quebec Artificial Intelligence Institute, Montr\u00e9al, Canada; School of Computer Science, McGill University, Montr\u00e9al, Canada+Mila - Quebec Artificial Intelligence Institute, Montr\u00e9al, Canada+Centre for Intelligent Machines, McGill University, Montr\u00e9al, Canada; School of Computer Science, McGill University, Montr\u00e9al, Canada+Mila - Quebec Artificial Intelligence Institute, Montr\u00e9al, Canada", "aff_domain": "mila.quebec; ; ; ", "email": "mila.quebec; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mondal22a.html", "aff_unique_index": "0+1+0;0+1;0+1+0;0+1", "aff_unique_norm": "McGill University;Quebec Artificial Intelligence Institute", "aff_unique_dep": "School of Computer Science;Artificial Intelligence", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec", "aff_unique_abbr": "McGill;Mila", "aff_campus_unique_index": "0+0+0;0+0;0+0+0;0+0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0+0+0;0+0;0+0+0;0+0", "aff_country_unique": "Canada" }, { "title": "EquiBind: 
Geometric Deep Learning for Drug Binding Structure Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18361", "id": "18361", "proceeding": "https://proceedings.mlr.press/v162/stark22b.html", "poster": "/media/PosterPDFs/ICML%202022/6c1e55ec7c43dc51a37472ddcbd756fb.png?t=1657196685.8968596", "slides": "", "author_site": "Hannes St\u00e4rk, Octavian Ganea, Lagnajit Pattanaik, Regina Barzilay, Tommi Jaakkola", "author": "Hannes St\u00e4rk; Octavian Ganea; Lagnajit Pattanaik; Dr.Regina Barzilay; Tommi Jaakkola", "abstract": "Predicting how a drug-like molecule binds to a specific protein target is a core problem in drug discovery. An extremely fast computational binding method would enable key applications such as fast virtual screening or drug engineering. Existing methods are computationally expensive as they rely on heavy candidate sampling coupled with scoring, ranking, and fine-tuning steps. We challenge this paradigm with EquiBind, an SE(3)-equivariant geometric deep learning model performing direct-shot prediction of both i) the receptor binding location (blind docking) and ii) the ligand\u2019s bound pose and orientation. EquiBind achieves significant speed-ups and better quality compared to traditional and recent baselines. Further, we show extra improvements when coupling it with existing fine-tuning techniques at the cost of increased running time. Finally, we propose a novel and fast fine-tuning model that adjusts torsion angles of a ligand\u2019s rotatable bonds based on closed form global minima of the von Mises angular distance to a given input atomic point cloud, avoiding previous expensive differential evolution strategies for energy minimization.", "bibtex": "@InProceedings{pmlr-v162-stark22b,\n title = \t {{E}qui{B}ind: Geometric Deep Learning for Drug Binding Structure Prediction},\n author = {St{\\\"a}rk, Hannes and Ganea, Octavian and Pattanaik, Lagnajit and Barzilay, Dr.Regina and Jaakkola, Tommi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20503--20521},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/stark22b/stark22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/stark22b.html},\n abstract = \t {Predicting how a drug-like molecule binds to a specific protein target is a core problem in drug discovery. An extremely fast computational binding method would enable key applications such as fast virtual screening or drug engineering. Existing methods are computationally expensive as they rely on heavy candidate sampling coupled with scoring, ranking, and fine-tuning steps. We challenge this paradigm with EquiBind, an SE(3)-equivariant geometric deep learning model performing direct-shot prediction of both i) the receptor binding location (blind docking) and ii) the ligand\u2019s bound pose and orientation. EquiBind achieves significant speed-ups and better quality compared to traditional and recent baselines. Further, we show extra improvements when coupling it with existing fine-tuning techniques at the cost of increased running time. 
Finally, we propose a novel and fast fine-tuning model that adjusts torsion angles of a ligand\u2019s rotatable bonds based on closed form global minima of the von Mises angular distance to a given input atomic point cloud, avoiding previous expensive differential evolution strategies for energy minimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/stark22b/stark22b.pdf", "supp": "", "pdf_size": 4626761, "gs_citation": 356, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2579310543705352041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Massachusetts Institute of Technology, MIT, Cambridge, MA, USA; Massachusetts Institute of Technology, MIT, Cambridge, MA, USA; Massachusetts Institute of Technology, MIT, Cambridge, MA, USA; Massachusetts Institute of Technology, MIT, Cambridge, MA, USA; Massachusetts Institute of Technology, MIT, Cambridge, MA, USA", "aff_domain": "mit.edu; ; ; ; ", "email": "mit.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/stark22b.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Equivalence Analysis between Counterfactual Regret Minimization and Online Mirror Descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17863", "id": "17863", "proceeding": "https://proceedings.mlr.press/v162/liu22e.html", "poster": "/media/PosterPDFs/ICML%202022/aeefb050911334869a7a5d9e4d0e1689.png?t=1656334226.3133473", "slides": "/media/icml-2022/Slides/17863.pdf", "author_site": "Weiming Liu, Huacong Jiang, Bin Li, Houqiang Li", "author": "Weiming Liu; Huacong Jiang; Bin Li; Houqiang Li", "abstract": "Follow-the-Regularized-Leader (FTRL) and Online Mirror Descent (OMD) are regret minimization algorithms for Online Convex Optimization (OCO), they are mathematically elegant but less practical in solving Extensive-Form Games (EFGs). Counterfactual Regret Minimization (CFR) is a technique for approximating Nash equilibria in EFGs. CFR and its variants have a fast convergence rate in practice, but their theoretical results are not satisfactory. In recent years, researchers have been trying to link CFRs with OCO algorithms, which may provide new theoretical results and inspire new algorithms. However, existing analysis is restricted to local decision points. In this paper, we show that CFRs with Regret Matching and Regret Matching+ are equivalent to special cases of FTRL and OMD, respectively. According to these equivalences, a new FTRL and a new OMD algorithm, which can be considered as extensions of vanilla CFR and CFR+, are derived. 
The experimental results show that the two variants converge faster than conventional FTRL and OMD, even faster than vanilla CFR and CFR+ in some EFGs.", "bibtex": "@InProceedings{pmlr-v162-liu22e,\n title = \t {Equivalence Analysis between Counterfactual Regret Minimization and Online Mirror Descent},\n author = {Liu, Weiming and Jiang, Huacong and Li, Bin and Li, Houqiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13717--13745},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22e/liu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22e.html},\n abstract = \t {Follow-the-Regularized-Leader (FTRL) and Online Mirror Descent (OMD) are regret minimization algorithms for Online Convex Optimization (OCO), they are mathematically elegant but less practical in solving Extensive-Form Games (EFGs). Counterfactual Regret Minimization (CFR) is a technique for approximating Nash equilibria in EFGs. CFR and its variants have a fast convergence rate in practice, but their theoretical results are not satisfactory. In recent years, researchers have been trying to link CFRs with OCO algorithms, which may provide new theoretical results and inspire new algorithms. However, existing analysis is restricted to local decision points. In this paper, we show that CFRs with Regret Matching and Regret Matching+ are equivalent to special cases of FTRL and OMD, respectively. According to these equivalences, a new FTRL and a new OMD algorithm, which can be considered as extensions of vanilla CFR and CFR+, are derived. 
The experimental results show that the two variants converge faster than conventional FTRL and OMD, even faster than vanilla CFR and CFR+ in some EFGs.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22e/liu22e.pdf", "supp": "", "pdf_size": 1093379, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10017671981463022123&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Data Science, University of Science and Technology of China, Hefei, China; Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China; Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China; Department of Electronic Engineering and Information Science, University of Science and Technology of China, Hefei, China", "aff_domain": "mail.ustc.edu.cn; ;ustc.edu.cn; ", "email": "mail.ustc.edu.cn; ;ustc.edu.cn; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22e.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Science and Technology of China", "aff_unique_dep": "School of Data Science", "aff_unique_url": "http://www.ustc.edu.cn", "aff_unique_abbr": "USTC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hefei", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Equivariance versus Augmentation for Spherical Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16211", "id": "16211", "proceeding": "https://proceedings.mlr.press/v162/gerken22a.html", "poster": "/media/PosterPDFs/ICML%202022/bee3d07327a21d8e7f02e10ba4b35c15.png?t=1657803043.2409825", "slides": "", "author_site": "Jan Gerken, Oscar Carlsson, Hampus Linander, Fredrik Ohlsson, Christoffer Petersson, Daniel Persson", "author": "Jan Gerken; Oscar Carlsson; Hampus Linander; Fredrik Ohlsson; Christoffer Petersson; Daniel Persson", "abstract": "We analyze the role of rotational equivariance in convolutional neural networks (CNNs) applied to spherical images. We compare the performance of the group equivariant networks known as S2CNNs and standard non-equivariant CNNs trained with an increasing amount of data augmentation. The chosen architectures can be considered baseline references for the respective design paradigms. Our models are trained and evaluated on single or multiple items from the MNIST- or FashionMNIST dataset projected onto the sphere. For the task of image classification, which is inherently rotationally invariant, we find that by considerably increasing the amount of data augmentation and the size of the networks, it is possible for the standard CNNs to reach at least the same performance as the equivariant network. In contrast, for the inherently equivariant task of semantic segmentation, the non-equivariant networks are consistently outperformed by the equivariant networks with significantly fewer parameters. 
We also analyze and compare the inference latency and training times of the different networks, enabling detailed tradeoff considerations between equivariant architectures and data augmentation for practical problems.", "bibtex": "@InProceedings{pmlr-v162-gerken22a,\n title = \t {Equivariance versus Augmentation for Spherical Images},\n author = {Gerken, Jan and Carlsson, Oscar and Linander, Hampus and Ohlsson, Fredrik and Petersson, Christoffer and Persson, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7404--7421},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gerken22a/gerken22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gerken22a.html},\n abstract = \t {We analyze the role of rotational equivariance in convolutional neural networks (CNNs) applied to spherical images. We compare the performance of the group equivariant networks known as S2CNNs and standard non-equivariant CNNs trained with an increasing amount of data augmentation. The chosen architectures can be considered baseline references for the respective design paradigms. Our models are trained and evaluated on single or multiple items from the MNIST- or FashionMNIST dataset projected onto the sphere. For the task of image classification, which is inherently rotationally invariant, we find that by considerably increasing the amount of data augmentation and the size of the networks, it is possible for the standard CNNs to reach at least the same performance as the equivariant network. In contrast, for the inherently equivariant task of semantic segmentation, the non-equivariant networks are consistently outperformed by the equivariant networks with significantly fewer parameters. 
We also analyze and compare the inference latency and training times of the different networks, enabling detailed tradeoff considerations between equivariant architectures and data augmentation for practical problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/gerken22a/gerken22a.pdf", "supp": "", "pdf_size": 1218749, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2388075100052458630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Mathematical Sciences, Chalmers University of Technology, Gothenburg, Sweden+Machine Learning Group at Berlin Institute of Technology, Berlin, Germany+Berlin Institute for the Foundations of Learning and Data (BIFOLD), Berlin, Germany; Department of Mathematical Sciences, Chalmers University of Technology, Gothenburg, Sweden; Department of Physics, University of Gothenburg, Gothenburg, Sweden; Department of Mathematics and Mathematical Statistics, Ume\u00e5 University, Ume\u00e5, Sweden; Zenseact, Gothenburg, Sweden; Department of Mathematical Sciences, Chalmers University of Technology, Gothenburg, Sweden", "aff_domain": "chalmers.se;chalmers.se;physics.gu.se;math.umu.se;zenseact.com;chalmers.se", "email": "chalmers.se;chalmers.se;physics.gu.se;math.umu.se;zenseact.com;chalmers.se", "github": "https://github.com/JanEGerken/sem_seg_s2cnn", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/gerken22a.html", "aff_unique_index": "0+1+2;0;3;4;5;0", "aff_unique_norm": "Chalmers University of Technology;Berlin Institute of Technology;Berlin Institute for the Foundations of Learning and Data;University of Gothenburg;Ume\u00e5 University;Zenseact", "aff_unique_dep": "Department of Mathematical Sciences;Machine Learning Group;;Department of Physics;Department of Mathematics and Mathematical Statistics;", "aff_unique_url": "https://www.chalmers.se;https://www.tu-berlin.de;;https://www.gu.se;https://www.umu.se;", "aff_unique_abbr": "Chalmers;TU Berlin;BIFOLD;GU;UMU;", "aff_campus_unique_index": "0+1+1;0;0;2;0;0", "aff_campus_unique": "Gothenburg;Berlin;Ume\u00e5", "aff_country_unique_index": "0+1+1;0;0;0;0;0", "aff_country_unique": "Sweden;Germany" }, { "title": "Equivariant Diffusion for Molecule Generation in 3D", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16253", "id": "16253", "proceeding": "https://proceedings.mlr.press/v162/hoogeboom22a.html", "poster": "/media/PosterPDFs/ICML%202022/0738069b244a1c43c83112b735140a16.png?t=1657568595.326601", "slides": "", "author_site": "Emiel Hoogeboom, Victor Garcia Satorras, Cl\u00e9ment Vignac, Max Welling", "author": "Emiel Hoogeboom; V\u0131\u0301ctor Garcia Satorras; Cl\u00e9ment Vignac; Max Welling", "abstract": "This work introduces a diffusion model for molecule generation in 3D that is equivariant to Euclidean transformations. Our E(3) Equivariant Diffusion Model (EDM) learns to denoise a diffusion process with an equivariant network that jointly operates on both continuous (atom coordinates) and categorical features (atom types). In addition, we provide a probabilistic analysis which admits likelihood computation of molecules using our model. 
Experimentally, the proposed method significantly outperforms previous 3D molecular generative methods regarding the quality of generated samples and the efficiency at training time.", "bibtex": "@InProceedings{pmlr-v162-hoogeboom22a,\n title = \t {Equivariant Diffusion for Molecule Generation in 3{D}},\n author = {Hoogeboom, Emiel and Satorras, V\\'{\\i}ctor Garcia and Vignac, Cl{\\'e}ment and Welling, Max},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8867--8887},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hoogeboom22a/hoogeboom22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hoogeboom22a.html},\n abstract = \t {This work introduces a diffusion model for molecule generation in 3D that is equivariant to Euclidean transformations. Our E(3) Equivariant Diffusion Model (EDM) learns to denoise a diffusion process with an equivariant network that jointly operates on both continuous (atom coordinates) and categorical features (atom types). In addition, we provide a probabilistic analysis which admits likelihood computation of molecules using our model. Experimentally, the proposed method significantly outperforms previous 3D molecular generative methods regarding the quality of generated samples and the efficiency at training time.}\n}", "pdf": "https://proceedings.mlr.press/v162/hoogeboom22a/hoogeboom22a.pdf", "supp": "", "pdf_size": 4839416, "gs_citation": 806, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9412014854490527272&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "UvA-Bosch Delta Lab, University of Amsterdam, Netherlands; UvA-Bosch Delta Lab, University of Amsterdam, Netherlands; EPFL, Lausanne, Switzerland; UvA-Bosch Delta Lab, University of Amsterdam, Netherlands", "aff_domain": "uva.nl;uva.nl;epfl.ch; ", "email": "uva.nl;uva.nl;epfl.ch; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hoogeboom22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Amsterdam;EPFL", "aff_unique_dep": "UvA-Bosch Delta Lab;", "aff_unique_url": "https://www.uva.nl;https://www.epfl.ch", "aff_unique_abbr": "UvA;EPFL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Netherlands;Switzerland" }, { "title": "Equivariant Priors for compressed sensing with unknown orientation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18279", "id": "18279", "proceeding": "https://proceedings.mlr.press/v162/kuzina22a.html", "poster": "/media/PosterPDFs/ICML%202022/363ce3cd61389226b4a55b2aee2dacd7_hBaNjl2.png?t=1658134156.8138032", "slides": "", "author_site": "Anna Kuzina, Kumar Pratik, Fabio Valerio Massoli, Arash Behboodi", "author": "Anna Kuzina; Kumar Pratik; Fabio Valerio Massoli; Arash Behboodi", "abstract": "In compressed sensing, the goal is to reconstruct the signal from an underdetermined system of linear measurements. Thus, prior knowledge about the signal of interest and its structure is required. Additionally, in many scenarios, the signal has an unknown orientation prior to measurements. 
To address such recovery problems, we propose using equivariant generative models as a prior, which encapsulate orientation information in their latent space. Thereby, we show that signals with unknown orientations can be recovered with iterative gradient descent on the latent space of these models and provide additional theoretical recovery guarantees. We construct an equivariant variational autoencoder and use the decoder as generative prior for compressed sensing. We discuss additional potential gains of the proposed approach in terms of convergence and latency.", "bibtex": "@InProceedings{pmlr-v162-kuzina22a,\n title = \t {Equivariant Priors for compressed sensing with unknown orientation},\n author = {Kuzina, Anna and Pratik, Kumar and Massoli, Fabio Valerio and Behboodi, Arash},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11753--11771},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kuzina22a/kuzina22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kuzina22a.html},\n abstract = \t {In compressed sensing, the goal is to reconstruct the signal from an underdetermined system of linear measurements. Thus, prior knowledge about the signal of interest and its structure is required. Additionally, in many scenarios, the signal has an unknown orientation prior to measurements. To address such recovery problems, we propose using equivariant generative models as a prior, which encapsulate orientation information in their latent space. Thereby, we show that signals with unknown orientations can be recovered with iterative gradient descent on the latent space of these models and provide additional theoretical recovery guarantees. We construct an equivariant variational autoencoder and use the decoder as generative prior for compressed sensing. 
We discuss additional potential gains of the proposed approach in terms of convergence and latency.}\n}", "pdf": "https://proceedings.mlr.press/v162/kuzina22a/kuzina22a.pdf", "supp": "", "pdf_size": 1450875, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11609669679486319648&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Vrije Universiteit Amsterdam, Netherlands + Qualcomm AI Research; Qualcomm AI Research, Qualcomm Technologies Netherlands B.V.; Qualcomm AI Research, Qualcomm Technologies Netherlands B.V.; Qualcomm AI Research, Qualcomm Technologies Netherlands B.V.", "aff_domain": "yandex.ru; ; ; ", "email": "yandex.ru; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kuzina22a.html", "aff_unique_index": "0+1;2;2;2", "aff_unique_norm": "Vrije Universiteit Amsterdam;Qualcomm;Qualcomm Technologies Netherlands B.V.", "aff_unique_dep": ";Qualcomm AI Research;Qualcomm AI Research", "aff_unique_url": "https://www.vu.nl;https://www.qualcomm.com/research;https://www.qualcomm.com/research", "aff_unique_abbr": "VU Amsterdam;QAI;QTN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0", "aff_country_unique": "Netherlands;United States" }, { "title": "Equivariant Quantum Graph Circuits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16081", "id": "16081", "proceeding": "https://proceedings.mlr.press/v162/mernyei22a.html", "poster": "/media/PosterPDFs/ICML%202022/eab9c5e9815adc4c40a6557495eed6d3_eT4eJuj.png?t=1657573984.672819", "slides": "", "author_site": "Peter Mernyei, Konstantinos Meichanetzidis, Ismail Ceylan", "author": "Peter Mernyei; Konstantinos Meichanetzidis; Ismail Ilkan Ceylan", "abstract": "We investigate quantum circuits for graph representation learning, and propose equivariant quantum graph circuits (EQGCs), as a class of parameterized quantum circuits with strong relational inductive bias for learning over graph-structured data. Conceptually, EQGCs serve as a unifying framework for quantum graph representation learning, allowing us to define several interesting subclasses which subsume existing proposals. In terms of the representation power, we prove that the studied subclasses of EQGCs are universal approximators for functions over the bounded graph domain. This theoretical perspective on quantum graph machine learning methods opens many directions for further work, and could lead to models with capabilities beyond those of classical approaches. 
We empirically verify the expressive power of EQGCs through a dedicated experiment on synthetic data, and additionally observe that the performance of EQGCs scales well with the depth of the model and does not suffer from barren plateau issues.", "bibtex": "@InProceedings{pmlr-v162-mernyei22a,\n title = \t {Equivariant Quantum Graph Circuits},\n author = {Mernyei, Peter and Meichanetzidis, Konstantinos and Ceylan, Ismail Ilkan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15401--15420},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mernyei22a/mernyei22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mernyei22a.html},\n abstract = \t {We investigate quantum circuits for graph representation learning, and propose equivariant quantum graph circuits (EQGCs), as a class of parameterized quantum circuits with strong relational inductive bias for learning over graph-structured data. Conceptually, EQGCs serve as a unifying framework for quantum graph representation learning, allowing us to define several interesting subclasses which subsume existing proposals. In terms of the representation power, we prove that the studied subclasses of EQGCs are universal approximators for functions over the bounded graph domain. This theoretical perspective on quantum graph machine learning methods opens many directions for further work, and could lead to models with capabilities beyond those of classical approaches. We empirically verify the expressive power of EQGCs through a dedicated experiment on synthetic data, and additionally observe that the performance of EQGCs scales well with the depth of the model and does not suffer from barren plateau issues.}\n}", "pdf": "https://proceedings.mlr.press/v162/mernyei22a/mernyei22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/mernyei22a-supp.zip", "pdf_size": 781507, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18308759795742721052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, University of Oxford, Oxford, UK + Charm Therapeutics, London, UK; Cambridge Quantum Computing and Quantinuum, Oxford, UK; Department of Computer Science, University of Oxford, Oxford, UK", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mernyei22a.html", "aff_unique_index": "0+1;2;0", "aff_unique_norm": "University of Oxford;Charm Therapeutics;Cambridge Quantum Computing", "aff_unique_dep": "Department of Computer Science;;Quantum Computing", "aff_unique_url": "https://www.ox.ac.uk;;https://www.cambridgequantum.com", "aff_unique_abbr": "Oxford;;CQC", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Oxford;;Cambridge", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Error-driven Input Modulation: Solving the Credit Assignment Problem without a Backward Pass", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16251", "id": "16251", "proceeding": "https://proceedings.mlr.press/v162/dellaferrera22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/6cf821bc98b2d343170185bb3de84cc4_5w4ef9r.png?t=1657526198.6316378", "slides": "", "author_site": "Giorgia Dellaferrera, Gabriel Kreiman", "author": "Giorgia Dellaferrera; Gabriel Kreiman", "abstract": "Supervised learning in artificial neural networks typically relies on backpropagation, where the weights are updated based on the error-function gradients and sequentially propagated from the output layer to the input layer. Although this approach has proven effective in a wide domain of applications, it lacks biological plausibility in many regards, including the weight symmetry problem, the dependence of learning on non-local signals, the freezing of neural activity during error propagation, and the update locking problem. Alternative training schemes have been introduced, including sign symmetry, feedback alignment, and direct feedback alignment, but they invariably rely on a backward pass that hinders the possibility of solving all the issues simultaneously. Here, we propose to replace the backward pass with a second forward pass in which the input signal is modulated based on the error of the network. We show that this novel learning rule comprehensively addresses all the above-mentioned issues and can be applied to both fully connected and convolutional models. We test this learning rule on MNIST, CIFAR-10, and CIFAR-100. These results help incorporate biological principles into machine learning.", "bibtex": "@InProceedings{pmlr-v162-dellaferrera22a,\n title = \t {Error-driven Input Modulation: Solving the Credit Assignment Problem without a Backward Pass},\n author = {Dellaferrera, Giorgia and Kreiman, Gabriel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4937--4955},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dellaferrera22a/dellaferrera22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dellaferrera22a.html},\n abstract = \t {Supervised learning in artificial neural networks typically relies on backpropagation, where the weights are updated based on the error-function gradients and sequentially propagated from the output layer to the input layer. Although this approach has proven effective in a wide domain of applications, it lacks biological plausibility in many regards, including the weight symmetry problem, the dependence of learning on non-local signals, the freezing of neural activity during error propagation, and the update locking problem. Alternative training schemes have been introduced, including sign symmetry, feedback alignment, and direct feedback alignment, but they invariably rely on a backward pass that hinders the possibility of solving all the issues simultaneously. Here, we propose to replace the backward pass with a second forward pass in which the input signal is modulated based on the error of the network. We show that this novel learning rule comprehensively addresses all the above-mentioned issues and can be applied to both fully connected and convolutional models. We test this learning rule on MNIST, CIFAR-10, and CIFAR-100. 
These results help incorporate biological principles into machine learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/dellaferrera22a/dellaferrera22a.pdf", "supp": "", "pdf_size": 931744, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12440766337737848620&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Center for Brains, Minds and Machines, Cambridge, MA, United States+Children\u2019s Hospital, Harvard Medical School, Boston, MA, United States+Institute of Neuroinformatics, University of Zurich and ETH Zurich, Switzerland; Center for Brains, Minds and Machines, Cambridge, MA, United States+Children\u2019s Hospital, Harvard Medical School, Boston, MA, United States", "aff_domain": "gmail.com;tch.harvard.edu", "email": "gmail.com;tch.harvard.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/dellaferrera22a.html", "aff_unique_index": "0+1+2;0+1", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard Medical School;University of Zurich", "aff_unique_dep": "Center for Brains, Minds and Machines;Children\u2019s Hospital;Institute of Neuroinformatics", "aff_unique_url": "https://cbmm.mit.edu/;https://hms.harvard.edu;https://www.neuro.ethz.ch/", "aff_unique_abbr": "MIT;HMS;UZH", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Cambridge;Boston;", "aff_country_unique_index": "0+0+1;0+0", "aff_country_unique": "United States;Switzerland" }, { "title": "Estimating Instance-dependent Bayes-label Transition Matrix using a Deep Neural Network", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16011", "id": "16011", "proceeding": "https://proceedings.mlr.press/v162/yang22p.html", "poster": "", "slides": "", "author_site": "Shuo Yang, Erkun Yang, Bo Han, Yang Liu, Min Xu, Gang Niu, Tongliang Liu", "author": "Shuo Yang; Erkun Yang; Bo Han; Yang Liu; Min Xu; Gang Niu; Tongliang Liu", "abstract": "In label-noise learning, estimating the transition matrix is a hot topic as the matrix plays an important role in building statistically consistent classifiers. Traditionally, the transition from clean labels to noisy labels (i.e., clean-label transition matrix (CLTM)) has been widely exploited to learn a clean label classifier by employing the noisy data. Motivated by that classifiers mostly output Bayes optimal labels for prediction, in this paper, we study to directly model the transition from Bayes optimal labels to noisy labels (i.e., Bayes-label transition matrix (BLTM)) and learn a classifier to predict Bayes optimal labels. Note that given only noisy data, it is ill-posed to estimate either the CLTM or the BLTM. But favorably, Bayes optimal labels have less uncertainty compared with the clean labels, i.e., the class posteriors of Bayes optimal labels are one-hot vectors while those of clean labels are not. This enables two advantages to estimate the BLTM, i.e., (a) a set of examples with theoretically guaranteed Bayes optimal labels can be collected out of noisy data; (b) the feasible solution space is much smaller. 
By exploiting the advantages, we estimate the BLTM parametrically by employing a deep neural network, leading to better generalization and superior classification performance.", "bibtex": "@InProceedings{pmlr-v162-yang22p,\n title = \t {Estimating Instance-dependent {B}ayes-label Transition Matrix using a Deep Neural Network},\n author = {Yang, Shuo and Yang, Erkun and Han, Bo and Liu, Yang and Xu, Min and Niu, Gang and Liu, Tongliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25302--25312},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22p/yang22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22p.html},\n abstract = \t {In label-noise learning, estimating the transition matrix is a hot topic as the matrix plays an important role in building statistically consistent classifiers. Traditionally, the transition from clean labels to noisy labels (i.e., clean-label transition matrix (CLTM)) has been widely exploited to learn a clean label classifier by employing the noisy data. Motivated by that classifiers mostly output Bayes optimal labels for prediction, in this paper, we study to directly model the transition from Bayes optimal labels to noisy labels (i.e., Bayes-label transition matrix (BLTM)) and learn a classifier to predict Bayes optimal labels. Note that given only noisy data, it is ill-posed to estimate either the CLTM or the BLTM. But favorably, Bayes optimal labels have less uncertainty compared with the clean labels, i.e., the class posteriors of Bayes optimal labels are one-hot vectors while those of clean labels are not. This enables two advantages to estimate the BLTM, i.e., (a) a set of examples with theoretically guaranteed Bayes optimal labels can be collected out of noisy data; (b) the feasible solution space is much smaller. By exploiting the advantages, we estimate the BLTM parametrically by employing a deep neural network, leading to better generalization and superior classification performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22p/yang22p.pdf", "supp": "", "pdf_size": 1368724, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3064369578542093228&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/yang22p.html" }, { "title": "Estimating and Penalizing Induced Preference Shifts in Recommender Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17365", "id": "17365", "proceeding": "https://proceedings.mlr.press/v162/carroll22a.html", "poster": "/media/PosterPDFs/ICML%202022/3eae62bba9ddf64f69d49dc48e2dd214.png?t=1657545286.3718412", "slides": "", "author_site": "Micah Carroll, Anca Dragan, Stuart Russell, Dylan Hadfield-Menell", "author": "Micah D Carroll; Anca Dragan; Stuart Russell; Dylan Hadfield-Menell", "abstract": "The content that a recommender system (RS) shows to users influences them. Therefore, when choosing a recommender to deploy, one is implicitly also choosing to induce specific internal states in users. 
Even more, systems trained via long-horizon optimization will have direct incentives to manipulate users, e.g. shift their preferences so they are easier to satisfy. We focus on induced preference shifts in users. We argue that {\u2013} before deployment {\u2013} system designers should: estimate the shifts a recommender would induce; evaluate whether such shifts would be undesirable; and perhaps even actively optimize to avoid problematic shifts. These steps involve two challenging ingredients: estimation requires anticipating how hypothetical policies would influence user preferences if deployed {\u2013} we do this by using historical user interaction data to train a predictive user model which implicitly contains their preference dynamics; evaluation and optimization additionally require metrics to assess whether such influences are manipulative or otherwise unwanted {\u2013} we use the notion of \"safe shifts\", that define a trust region within which behavior is safe: for instance, the natural way in which users would shift without interference from the system could be deemed \"safe\". In simulated experiments, we show that our learned preference dynamics model is effective in estimating user preferences and how they would respond to new recommenders. Additionally, we show that recommenders that optimize for staying in the trust region can avoid manipulative behaviors while still generating engagement.", "bibtex": "@InProceedings{pmlr-v162-carroll22a,\n title = \t {Estimating and Penalizing Induced Preference Shifts in Recommender Systems},\n author = {Carroll, Micah D and Dragan, Anca and Russell, Stuart and Hadfield-Menell, Dylan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2686--2708},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/carroll22a/carroll22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/carroll22a.html},\n abstract = \t {The content that a recommender system (RS) shows to users influences them. Therefore, when choosing a recommender to deploy, one is implicitly also choosing to induce specific internal states in users. Even more, systems trained via long-horizon optimization will have direct incentives to manipulate users, e.g. shift their preferences so they are easier to satisfy. We focus on induced preference shifts in users. We argue that {\u2013} before deployment {\u2013} system designers should: estimate the shifts a recommender would induce; evaluate whether such shifts would be undesirable; and perhaps even actively optimize to avoid problematic shifts. These steps involve two challenging ingredients: estimation requires anticipating how hypothetical policies would influence user preferences if deployed {\u2013} we do this by using historical user interaction data to train a predictive user model which implicitly contains their preference dynamics; evaluation and optimization additionally require metrics to assess whether such influences are manipulative or otherwise unwanted {\u2013} we use the notion of \"safe shifts\", that define a trust region within which behavior is safe: for instance, the natural way in which users would shift without interference from the system could be deemed \"safe\". 
In simulated experiments, we show that our learned preference dynamics model is effective in estimating user preferences and how they would respond to new recommenders. Additionally, we show that recommenders that optimize for staying in the trust region can avoid manipulative behaviors while still generating engagement.}\n}", "pdf": "https://proceedings.mlr.press/v162/carroll22a/carroll22a.pdf", "supp": "", "pdf_size": 2332371, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3587500308278200758&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Berkeley; Berkeley; Berkeley; MIT", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/carroll22a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Estimating the Optimal Covariance with Imperfect Mean in Diffusion Probabilistic Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16751", "id": "16751", "proceeding": "https://proceedings.mlr.press/v162/bao22d.html", "poster": "/media/PosterPDFs/ICML%202022/36a16a2505369e0c922b6ea7a23a56d2.png?t=1656746571.9049275", "slides": "/media/icml-2022/Slides/16751_I3rNZYq.pdf", "author_site": "Fan Bao, Chongxuan Li, Jiacheng Sun, Jun Zhu, Bo Zhang", "author": "Fan Bao; Chongxuan Li; Jiacheng Sun; Jun Zhu; Bo Zhang", "abstract": "Diffusion probabilistic models (DPMs) are a class of powerful deep generative models (DGMs). Despite their success, the iterative generation process over the full timesteps is much less efficient than other DGMs such as GANs. Thus, the generation performance on a subset of timesteps is crucial, which is greatly influenced by the covariance design in DPMs. In this work, we consider diagonal and full covariances to improve the expressive power of DPMs. We derive the optimal result for such covariances, and then correct it when the mean of DPMs is imperfect. Both the optimal and the corrected ones can be decomposed into terms of conditional expectations over functions of noise. Building upon it, we propose to estimate the optimal covariance and its correction given imperfect mean by learning these conditional expectations. Our method can be applied to DPMs with both discrete and continuous timesteps. We consider the diagonal covariance in our implementation for computational efficiency. For an efficient practical implementation, we adopt a parameter sharing scheme and a two-stage training process. 
Empirically, our method outperforms a wide variety of covariance design on likelihood results, and improves the sample quality especially on a small number of timesteps.", "bibtex": "@InProceedings{pmlr-v162-bao22d,\n title = \t {Estimating the Optimal Covariance with Imperfect Mean in Diffusion Probabilistic Models},\n author = {Bao, Fan and Li, Chongxuan and Sun, Jiacheng and Zhu, Jun and Zhang, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1555--1584},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bao22d/bao22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/bao22d.html},\n abstract = \t {Diffusion probabilistic models (DPMs) are a class of powerful deep generative models (DGMs). Despite their success, the iterative generation process over the full timesteps is much less efficient than other DGMs such as GANs. Thus, the generation performance on a subset of timesteps is crucial, which is greatly influenced by the covariance design in DPMs. In this work, we consider diagonal and full covariances to improve the expressive power of DPMs. We derive the optimal result for such covariances, and then correct it when the mean of DPMs is imperfect. Both the optimal and the corrected ones can be decomposed into terms of conditional expectations over functions of noise. Building upon it, we propose to estimate the optimal covariance and its correction given imperfect mean by learning these conditional expectations. Our method can be applied to DPMs with both discrete and continuous timesteps. We consider the diagonal covariance in our implementation for computational efficiency. For an efficient practical implementation, we adopt a parameter sharing scheme and a two-stage training process. Empirically, our method outperforms a wide variety of covariance design on likelihood results, and improves the sample quality especially on a small number of timesteps.}\n}", "pdf": "https://proceedings.mlr.press/v162/bao22d/bao22d.pdf", "supp": "", "pdf_size": 27163617, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2323665209976347341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Dept. of Comp. Sci. & Tech., Institute for AI, Tsinghua-Huawei Joint Center for AI, BNRist Center, State Key Lab for Intell. Tech. & Sys., Tsinghua University; Gaoling School of AI, Renmin University of China; Beijing Key Lab of Big Data Management and Analysis Methods, Beijing, China; Huawei Noah\u2019s Ark Lab; Dept. of Comp. Sci. & Tech., Institute for AI, Tsinghua-Huawei Joint Center for AI, BNRist Center, State Key Lab for Intell. Tech. & Sys., Tsinghua University", "aff_domain": "tsinghua.edu.cn;ruc.edu.cn;huawei.com;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;ruc.edu.cn;huawei.com;tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/bao22d.html", "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Tsinghua University;Renmin University of China;Beijing Key Lab of Big Data Management and Analysis Methods;Huawei", "aff_unique_dep": "Dept. of Comp. Sci. 
& Tech.;Gaoling School of AI;Big Data Management and Analysis;Noah\u2019s Ark Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ruc.edu.cn;;https://www.huawei.com", "aff_unique_abbr": "THU;RUC;;Huawei", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Estimation in Rotationally Invariant Generalized Linear Models via Approximate Message Passing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16163", "id": "16163", "proceeding": "https://proceedings.mlr.press/v162/venkataramanan22a.html", "poster": "/media/PosterPDFs/ICML%202022/766ebcd59621e305170616ba3d3dac32.png?t=1657192735.724726", "slides": "", "author_site": "Ramji Venkataramanan, Kevin K\u00f6gler, Marco Mondelli", "author": "Ramji Venkataramanan; Kevin K\u00f6gler; Marco Mondelli", "abstract": "We consider the problem of signal estimation in generalized linear models defined via rotationally invariant design matrices. Since these matrices can have an arbitrary spectral distribution, this model is well suited for capturing complex correlation structures which often arise in applications. We propose a novel family of approximate message passing (AMP) algorithms for signal estimation, and rigorously characterize their performance in the high-dimensional limit via a state evolution recursion. Our rotationally invariant AMP has complexity of the same order as the existing AMP derived under the restrictive assumption of a Gaussian design; our algorithm also recovers this existing AMP as a special case. Numerical results showcase a performance close to Vector AMP (which is conjectured to be Bayes-optimal in some settings), but obtained with a much lower complexity, as the proposed algorithm does not require a computationally expensive singular value decomposition.", "bibtex": "@InProceedings{pmlr-v162-venkataramanan22a,\n title = \t {Estimation in Rotationally Invariant Generalized Linear Models via Approximate Message Passing},\n author = {Venkataramanan, Ramji and K{\\\"o}gler, Kevin and Mondelli, Marco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22120--22144},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/venkataramanan22a/venkataramanan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/venkataramanan22a.html},\n abstract = \t {We consider the problem of signal estimation in generalized linear models defined via rotationally invariant design matrices. Since these matrices can have an arbitrary spectral distribution, this model is well suited for capturing complex correlation structures which often arise in applications. We propose a novel family of approximate message passing (AMP) algorithms for signal estimation, and rigorously characterize their performance in the high-dimensional limit via a state evolution recursion. Our rotationally invariant AMP has complexity of the same order as the existing AMP derived under the restrictive assumption of a Gaussian design; our algorithm also recovers this existing AMP as a special case. 
Numerical results showcase a performance close to Vector AMP (which is conjectured to be Bayes-optimal in some settings), but obtained with a much lower complexity, as the proposed algorithm does not require a computationally expensive singular value decomposition.}\n}", "pdf": "https://proceedings.mlr.press/v162/venkataramanan22a/venkataramanan22a.pdf", "supp": "", "pdf_size": 2341343, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12969908349905111775&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Cambridge, United Kingdom; ISTA, Austria; ISTA, Austria", "aff_domain": "cam.ac.uk; ;ist.ac.at", "email": "cam.ac.uk; ;ist.ac.at", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/venkataramanan22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Cambridge;Institute of Science and Technology Austria", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://www.ista.ac.at", "aff_unique_abbr": "Cambridge;ISTA", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United Kingdom;Austria" }, { "title": "Evaluating the Adversarial Robustness of Adaptive Test-time Defenses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18041", "id": "18041", "proceeding": "https://proceedings.mlr.press/v162/croce22a.html", "poster": "/media/PosterPDFs/ICML%202022/a91bc76c2a6302e573badedcbf57bf7a.png?t=1657923945.5248916", "slides": "/media/icml-2022/Slides/18041.pdf", "author_site": "Francesco Croce, Sven Gowal, Thomas Brunner, Evan Shelhamer, Matthias Hein, Taylan Cemgil", "author": "Francesco Croce; Sven Gowal; Thomas Brunner; Evan Shelhamer; Matthias Hein; Taylan Cemgil", "abstract": "Adaptive defenses, which optimize at test time, promise to improve adversarial robustness. We categorize such adaptive test-time defenses, explain their potential benefits and drawbacks, and evaluate a representative variety of the latest adaptive defenses for image classification. Unfortunately, none significantly improve upon static defenses when subjected to our careful case study evaluation. Some even weaken the underlying static model while simultaneously increasing inference computation. While these results are disappointing, we still believe that adaptive test-time defenses are a promising avenue of research and, as such, we provide recommendations for their thorough evaluation. We extend the checklist of Carlini et al. (2019) by providing concrete steps specific to adaptive defenses.", "bibtex": "@InProceedings{pmlr-v162-croce22a,\n title = \t {Evaluating the Adversarial Robustness of Adaptive Test-time Defenses},\n author = {Croce, Francesco and Gowal, Sven and Brunner, Thomas and Shelhamer, Evan and Hein, Matthias and Cemgil, Taylan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4421--4435},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/croce22a/croce22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/croce22a.html},\n abstract = \t {Adaptive defenses, which optimize at test time, promise to improve adversarial robustness. 
We categorize such adaptive test-time defenses, explain their potential benefits and drawbacks, and evaluate a representative variety of the latest adaptive defenses for image classification. Unfortunately, none significantly improve upon static defenses when subjected to our careful case study evaluation. Some even weaken the underlying static model while simultaneously increasing inference computation. While these results are disappointing, we still believe that adaptive test-time defenses are a promising avenue of research and, as such, we provide recommendations for their thorough evaluation. We extend the checklist of Carlini et al. (2019) by providing concrete steps specific to adaptive defenses.}\n}", "pdf": "https://proceedings.mlr.press/v162/croce22a/croce22a.pdf", "supp": "", "pdf_size": 1031725, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9007385894917173233&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/croce22a.html" }, { "title": "Evolving Curricula with Regret-Based Environment Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16333", "id": "16333", "proceeding": "https://proceedings.mlr.press/v162/parker-holder22a.html", "poster": "/media/PosterPDFs/ICML%202022/a3f390d88e4c41f2747bfa2f1b5f87db_jy21dtt.png?t=1657556038.3368232", "slides": "", "author_site": "Jack Parker-Holder, Minqi Jiang, Michael Dennis, Mikayel Samvelyan, Jakob Foerster, Edward Grefenstette, Tim Rockt\u00e4schel", "author": "Jack Parker-Holder; Minqi Jiang; Michael Dennis; Mikayel Samvelyan; Jakob Foerster; Edward Grefenstette; Tim Rockt\u00e4schel", "abstract": "Training generally-capable agents with reinforcement learning (RL) remains a significant challenge. A promising avenue for improving the robustness of RL agents is through the use of curricula. One such class of methods frames environment design as a game between a student and a teacher, using regret-based objectives to produce environment instantiations (or levels) at the frontier of the student agent\u2019s capabilities. These methods benefit from theoretical robustness guarantees at equilibrium, yet they often struggle to find effective levels in challenging design spaces in practice. By contrast, evolutionary approaches incrementally alter environment complexity, resulting in potentially open-ended learning, but often rely on domain-specific heuristics and vast amounts of computational resources. This work proposes harnessing the power of evolution in a principled, regret-based curriculum. Our approach, which we call Adversarially Compounding Complexity by Editing Levels (ACCEL), seeks to constantly produce levels at the frontier of an agent\u2019s capabilities, resulting in curricula that start simple but become increasingly complex. ACCEL maintains the theoretical benefits of prior regret-based methods, while providing significant empirical gains in a diverse set of environments. 
An interactive version of this paper is available at https://accelagent.github.io.", "bibtex": "@InProceedings{pmlr-v162-parker-holder22a,\n title = \t {Evolving Curricula with Regret-Based Environment Design},\n author = {Parker-Holder, Jack and Jiang, Minqi and Dennis, Michael and Samvelyan, Mikayel and Foerster, Jakob and Grefenstette, Edward and Rockt{\\\"a}schel, Tim},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17473--17498},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/parker-holder22a/parker-holder22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/parker-holder22a.html},\n abstract = \t {Training generally-capable agents with reinforcement learning (RL) remains a significant challenge. A promising avenue for improving the robustness of RL agents is through the use of curricula. One such class of methods frames environment design as a game between a student and a teacher, using regret-based objectives to produce environment instantiations (or levels) at the frontier of the student agent\u2019s capabilities. These methods benefit from theoretical robustness guarantees at equilibrium, yet they often struggle to find effective levels in challenging design spaces in practice. By contrast, evolutionary approaches incrementally alter environment complexity, resulting in potentially open-ended learning, but often rely on domain-specific heuristics and vast amounts of computational resources. This work proposes harnessing the power of evolution in a principled, regret-based curriculum. Our approach, which we call Adversarially Compounding Complexity by Editing Levels (ACCEL), seeks to constantly produce levels at the frontier of an agent\u2019s capabilities, resulting in curricula that start simple but become increasingly complex. ACCEL maintains the theoretical benefits of prior regret-based methods, while providing significant empirical gains in a diverse set of environments. 
An interactive version of this paper is available at https://accelagent.github.io.}\n}", "pdf": "https://proceedings.mlr.press/v162/parker-holder22a/parker-holder22a.pdf", "supp": "", "pdf_size": 6593210, "gs_citation": 148, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5291399435373033204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Meta AI+University of Oxford+UCL; Meta AI+UCL+University of Oxford; UC Berkeley; Meta AI+UCL; University of Oxford; Meta AI+UCL; Meta AI+UCL", "aff_domain": "robots.ox.ac.uk;fb.com; ; ; ; ; ", "email": "robots.ox.ac.uk;fb.com; ; ; ; ; ", "github": "https://accelagent.github.io", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/parker-holder22a.html", "aff_unique_index": "0+1+2;0+2+1;3;0+2;1;0+2;0+2", "aff_unique_norm": "Meta;University of Oxford;University College London;University of California, Berkeley", "aff_unique_dep": "Meta AI;;;", "aff_unique_url": "https://meta.com;https://www.ox.ac.uk;https://www.ucl.ac.uk;https://www.berkeley.edu", "aff_unique_abbr": "Meta;Oxford;UCL;UC Berkeley", "aff_campus_unique_index": ";;1;;;", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+1+1;0+1+1;0;0+1;1;0+1;0+1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Exact Learning of Preference Structure: Single-peaked Preferences and Beyond", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17949", "id": "17949", "proceeding": "https://proceedings.mlr.press/v162/kraiczy22a.html", "poster": "/media/PosterPDFs/ICML%202022/c4c455df3c54f292ae22f6791fd2553e.png?t=1658401801.6743157", "slides": "", "author_site": "Sonja Kraiczy, Edith Elkind", "author": "Sonja Kraiczy; Edith Elkind", "abstract": "We consider the setting where the members of a society (voters) have preferences over candidates, and the candidates can be ordered on an axis so that the voters\u2019 preferences are single-peaked on this axis. We ask whether this axis can be identified by sampling the voters\u2019 preferences. For several natural distributions, we obtain tight bounds on the number of samples required and show that, surprisingly, the bounds are independent of the number of candidates. We extend our results to the case where voters\u2019 preferences are sampled from two different axes over the same candidate set (one of which may be known). We also consider two alternative models of learning: (1) sampling pairwise comparisons rather than entire votes, and (2) learning from equivalence queries.", "bibtex": "@InProceedings{pmlr-v162-kraiczy22a,\n title = \t {Exact Learning of Preference Structure: Single-peaked Preferences and Beyond},\n author = {Kraiczy, Sonja and Elkind, Edith},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11598--11612},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kraiczy22a/kraiczy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kraiczy22a.html},\n abstract = \t {We consider the setting where the members of a society (voters) have preferences over candidates, and the candidates can be ordered on an axis so that the voters\u2019 preferences are single-peaked on this axis. 
We ask whether this axis can be identified by sampling the voters\u2019 preferences. For several natural distributions, we obtain tight bounds on the number of samples required and show that, surprisingly, the bounds are independent of the number of candidates. We extend our results to the case where voters\u2019 preferences are sampled from two different axes over the same candidate set (one of which may be known). We also consider two alternative models of learning: (1) sampling pairwise comparisons rather than entire votes, and (2) learning from equivalence queries.}\n}", "pdf": "https://proceedings.mlr.press/v162/kraiczy22a/kraiczy22a.pdf", "supp": "", "pdf_size": 384804, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6696941738554964711&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford", "aff_domain": "ox.ac.uk;ox.ac.uk", "email": "ox.ac.uk;ox.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kraiczy22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Exact Optimal Accelerated Complexity for Fixed-Point Iterations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17039", "id": "17039", "proceeding": "https://proceedings.mlr.press/v162/park22c.html", "poster": "/media/PosterPDFs/ICML%202022/f5f8590cd58a54e94377e6ae2eded4d9.png?t=1657899105.5551639", "slides": "", "author_site": "Jisun Park, Ernest Ryu", "author": "Jisun Park; Ernest K Ryu", "abstract": "Despite the broad use of fixed-point iterations throughout applied mathematics, the optimal convergence rate of general fixed-point problems with nonexpansive nonlinear operators has not been established. This work presents an acceleration mechanism for fixed-point iterations with nonexpansive operators, contractive operators, and nonexpansive operators satisfying a H\u00f6lder-type growth condition. We then provide matching complexity lower bounds to establish the exact optimality of the acceleration mechanisms in the nonexpansive and contractive setups. 
Finally, we provide experiments with CT imaging, optimal transport, and decentralized optimization to demonstrate the practical effectiveness of the acceleration mechanism.", "bibtex": "@InProceedings{pmlr-v162-park22c,\n title = \t {Exact Optimal Accelerated Complexity for Fixed-Point Iterations},\n author = {Park, Jisun and Ryu, Ernest K},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17420--17457},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/park22c/park22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/park22c.html},\n abstract = \t {Despite the broad use of fixed-point iterations throughout applied mathematics, the optimal convergence rate of general fixed-point problems with nonexpansive nonlinear operators has not been established. This work presents an acceleration mechanism for fixed-point iterations with nonexpansive operators, contractive operators, and nonexpansive operators satisfying a H\u00f6lder-type growth condition. We then provide matching complexity lower bounds to establish the exact optimality of the acceleration mechanisms in the nonexpansive and contractive setups. Finally, we provide experiments with CT imaging, optimal transport, and decentralized optimization to demonstrate the practical effectiveness of the acceleration mechanism.}\n}", "pdf": "https://proceedings.mlr.press/v162/park22c/park22c.pdf", "supp": "", "pdf_size": 1712068, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8156976472593333819&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Mathematical Sciences, Seoul National University; Department of Mathematical Sciences, Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/park22c.html", "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Mathematical Sciences", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Examining Scaling and Transfer of Language Model Architectures for Machine Translation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17047", "id": "17047", "proceeding": "https://proceedings.mlr.press/v162/zhang22h.html", "poster": "/media/PosterPDFs/ICML%202022/d14388bb836687ff2b16b7bee6bab182_ZYTanaU.png?t=1656948852.2552834", "slides": "/media/icml-2022/Slides/17047.pdf", "author_site": "Biao Zhang, Behrooz Ghorbani, Ankur Bapna, Yong Cheng, Xavier Garcia, Jonathan Shen, Orhan Firat", "author": "Biao Zhang; Behrooz Ghorbani; Ankur Bapna; Yong Cheng; Xavier Garcia; Jonathan Shen; Orhan Firat", "abstract": "Natural language understanding and generation models follow one of the two dominant architectural paradigms: language models (LMs) that process concatenated sequences in a single stack of layers, and encoder-decoder models (EncDec) that utilize separate layer stacks for input and output processing. 
In machine translation, EncDec has long been the favoured approach, but with few studies investigating the performance of LMs. In this work, we thoroughly examine the role of several architectural design choices on the performance of LMs on bilingual, (massively) multilingual and zero-shot translation tasks, under systematic variations of data conditions and model sizes. Our results show that: (i) Different LMs have different scaling properties, where architectural differences often have a significant impact on model performance at small scales, but the performance gap narrows as the number of parameters increases, (ii) Several design choices, including causal masking and language-modeling objectives for the source sequence, have detrimental effects on translation quality, and (iii) When paired with full-visible masking for source sequences, LMs could perform on par with EncDec on supervised bilingual and multilingual translation tasks, and improve greatly on zero-shot directions by facilitating the reduction of off-target translations.", "bibtex": "@InProceedings{pmlr-v162-zhang22h,\n title = \t {Examining Scaling and Transfer of Language Model Architectures for Machine Translation},\n author = {Zhang, Biao and Ghorbani, Behrooz and Bapna, Ankur and Cheng, Yong and Garcia, Xavier and Shen, Jonathan and Firat, Orhan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26176--26192},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22h/zhang22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22h.html},\n abstract = \t {Natural language understanding and generation models follow one of the two dominant architectural paradigms: language models (LMs) that process concatenated sequences in a single stack of layers, and encoder-decoder models (EncDec) that utilize separate layer stacks for input and output processing. In machine translation, EncDec has long been the favoured approach, but with few studies investigating the performance of LMs. In this work, we thoroughly examine the role of several architectural design choices on the performance of LMs on bilingual, (massively) multilingual and zero-shot translation tasks, under systematic variations of data conditions and model sizes. 
Our results show that: (i) Different LMs have different scaling properties, where architectural differences often have a significant impact on model performance at small scales, but the performance gap narrows as the number of parameters increases, (ii) Several design choices, including causal masking and language-modeling objectives for the source sequence, have detrimental effects on translation quality, and (iii) When paired with full-visible masking for source sequences, LMs could perform on par with EncDec on supervised bilingual and multilingual translation tasks, and improve greatly on zero-shot directions by facilitating the reduction of off-target translations.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22h/zhang22h.pdf", "supp": "", "pdf_size": 915588, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13104352547978388815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Informatics, University of Edinburgh+Google Research; Google Research; Google Research; Google Research; Google Research; Google Research; Google Research", "aff_domain": "ed.ac.uk; ; ; ; ; ;google.com", "email": "ed.ac.uk; ; ; ; ; ;google.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhang22h.html", "aff_unique_index": "0+1;1;1;1;1;1;1", "aff_unique_norm": "University of Edinburgh;Google", "aff_unique_dep": "School of Informatics;Google Research", "aff_unique_url": "https://www.ed.ac.uk;https://research.google", "aff_unique_abbr": "Edinburgh;Google Research", "aff_campus_unique_index": "0+1;1;1;1;1;1;1", "aff_campus_unique": "Edinburgh;Mountain View", "aff_country_unique_index": "0+1;1;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Exploiting Independent Instruments: Identification and Distribution Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16861", "id": "16861", "proceeding": "https://proceedings.mlr.press/v162/saengkyongam22a.html", "poster": "/media/PosterPDFs/ICML%202022/39dd987a9d27f1045aa0ad3ed5995dd2.png?t=1657533237.949285", "slides": "", "author_site": "Sorawit Saengkyongam, Leonard Henckel, Niklas Pfister, Jonas Peters", "author": "Sorawit Saengkyongam; Leonard Henckel; Niklas Pfister; Jonas Peters", "abstract": "Instrumental variable models allow us to identify a causal function between covariates $X$ and a response $Y$, even in the presence of unobserved confounding. Most of the existing estimators assume that the error term in the response $Y$ and the hidden confounders are uncorrelated with the instruments $Z$. This is often motivated by a graphical separation, an argument that also justifies independence. Positing an independence restriction, however, leads to strictly stronger identifiability results. We connect to the existing literature in econometrics and provide a practical method called HSIC-X for exploiting independence that can be combined with any gradient-based learning procedure. We see that even in identifiable settings, taking into account higher moments may yield better finite sample results. Furthermore, we exploit the independence for distribution generalization. We prove that the proposed estimator is invariant to distributional shifts on the instruments and worst-case optimal whenever these shifts are sufficiently strong. 
These results hold even in the under-identified case where the instruments are not sufficiently rich to identify the causal function.", "bibtex": "@InProceedings{pmlr-v162-saengkyongam22a,\n title = \t {Exploiting Independent Instruments: Identification and Distribution Generalization},\n author = {Saengkyongam, Sorawit and Henckel, Leonard and Pfister, Niklas and Peters, Jonas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18935--18958},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saengkyongam22a/saengkyongam22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/saengkyongam22a.html},\n abstract = \t {Instrumental variable models allow us to identify a causal function between covariates $X$ and a response $Y$, even in the presence of unobserved confounding. Most of the existing estimators assume that the error term in the response $Y$ and the hidden confounders are uncorrelated with the instruments $Z$. This is often motivated by a graphical separation, an argument that also justifies independence. Positing an independence restriction, however, leads to strictly stronger identifiability results. We connect to the existing literature in econometrics and provide a practical method called HSIC-X for exploiting independence that can be combined with any gradient-based learning procedure. We see that even in identifiable settings, taking into account higher moments may yield better finite sample results. Furthermore, we exploit the independence for distribution generalization. We prove that the proposed estimator is invariant to distributional shifts on the instruments and worst-case optimal whenever these shifts are sufficiently strong. 
These results hold even in the under-identified case where the instruments are not sufficiently rich to identify the causal function.}\n}", "pdf": "https://proceedings.mlr.press/v162/saengkyongam22a/saengkyongam22a.pdf", "supp": "", "pdf_size": 889631, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7573181679595557794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Mathematical Sciences, University of Copenhagen, Denmark", "aff_domain": "math.ku.dk; ; ; ", "email": "math.ku.dk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/saengkyongam22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "Department of Mathematical Sciences", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Denmark" }, { "title": "Exploiting Redundancy: Separable Group Convolutional Networks on Lie Groups", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17825", "id": "17825", "proceeding": "https://proceedings.mlr.press/v162/knigge22a.html", "poster": "/media/PosterPDFs/ICML%202022/b8b4b727d6f5d1b61fff7be687f7970f.png?t=1658339669.4886634", "slides": "", "author_site": "David Knigge, David Romero, Erik Bekkers", "author": "David M. Knigge; David W Romero; Erik J Bekkers", "abstract": "Group convolutional neural networks (G-CNNs) have been shown to increase parameter efficiency and model accuracy by incorporating geometric inductive biases. In this work, we investigate the properties of representations learned by regular G-CNNs, and show considerable parameter redundancy in group convolution kernels. This finding motivates further weight-tying by sharing convolution kernels over subgroups. To this end, we introduce convolution kernels that are separable over the subgroup and channel dimensions. In order to obtain equivariance to arbitrary affine Lie groups we provide a continuous parameterisation of separable convolution kernels. We evaluate our approach across several vision datasets, and show that our weight sharing leads to improved performance and computational efficiency. In many settings, separable G-CNNs outperform their non-separable counterpart, while only using a fraction of their training time. In addition, thanks to the increase in computational efficiency, we are able to implement G-CNNs equivariant to the $\\mathrm{Sim(2)}$ group; the group of dilations, rotations and translations of the plane. $\\mathrm{Sim(2)}$-equivariance further improves performance on all tasks considered, and achieves state-of-the-art performance on rotated MNIST.", "bibtex": "@InProceedings{pmlr-v162-knigge22a,\n title = \t {Exploiting Redundancy: Separable Group Convolutional Networks on Lie Groups},\n author = {Knigge, David M. 
and Romero, David W and Bekkers, Erik J},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11359--11386},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/knigge22a/knigge22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/knigge22a.html},\n abstract = \t {Group convolutional neural networks (G-CNNs) have been shown to increase parameter efficiency and model accuracy by incorporating geometric inductive biases. In this work, we investigate the properties of representations learned by regular G-CNNs, and show considerable parameter redundancy in group convolution kernels. This finding motivates further weight-tying by sharing convolution kernels over subgroups. To this end, we introduce convolution kernels that are separable over the subgroup and channel dimensions. In order to obtain equivariance to arbitrary affine Lie groups we provide a continuous parameterisation of separable convolution kernels. We evaluate our approach across several vision datasets, and show that our weight sharing leads to improved performance and computational efficiency. In many settings, separable G-CNNs outperform their non-separable counterpart, while only using a fraction of their training time. In addition, thanks to the increase in computational efficiency, we are able to implement G-CNNs equivariant to the $\\mathrm{Sim(2)}$ group; the group of dilations, rotations and translations of the plane. $\\mathrm{Sim(2)}$-equivariance further improves performance on all tasks considered, and achieves state-of-the-art performance on rotated MNIST.}\n}", "pdf": "https://proceedings.mlr.press/v162/knigge22a/knigge22a.pdf", "supp": "", "pdf_size": 4353947, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15152080644760721791&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Amsterdam, The Netherlands; Vrije Universiteit Amsterdam, The Netherlands; University of Amsterdam, The Netherlands", "aff_domain": "uva.nl; ; ", "email": "uva.nl; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/knigge22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Amsterdam;Vrije Universiteit Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.uva.nl;https://www.vu.nl", "aff_unique_abbr": "UvA;VU Amsterdam", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amsterdam", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Exploring and Exploiting Hubness Priors for High-Quality GAN Latent Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16223", "id": "16223", "proceeding": "https://proceedings.mlr.press/v162/liang22b.html", "poster": "/media/PosterPDFs/ICML%202022/2ba8698b79439589fdd2b0f7218d8b07.png?t=1657187588.6221328", "slides": "/media/icml-2022/Slides/16223_u087A6c.pdf", "author_site": "Yuanbang Liang, Jing Wu, Yu-Kun Lai, Yipeng Qin", "author": "Yuanbang Liang; Jing Wu; Yu-Kun Lai; Yipeng Qin", "abstract": "Despite the extensive studies on Generative Adversarial Networks (GANs), how to reliably sample high-quality images from their latent spaces remains an under-explored topic. 
In this paper, we propose a novel GAN latent sampling method by exploring and exploiting the hubness priors of GAN latent distributions. Our key insight is that the high dimensionality of the GAN latent space will inevitably lead to the emergence of hub latents that usually have much larger sampling densities than other latents in the latent space. As a result, these hub latents are better trained and thus contribute more to the synthesis of high-quality images. Unlike the a posterior \"cherry-picking\", our method is highly efficient as it is an a priori method that identifies high-quality latents before the synthesis of images. Furthermore, we show that the well-known but purely empirical truncation trick is a naive approximation to the central clustering effect of hub latents, which not only uncovers the rationale of the truncation trick, but also indicates the superiority and fundamentality of our method. Extensive experimental results demonstrate the effectiveness of the proposed method. Our code is available at: https://github.com/Byronliang8/HubnessGANSampling.", "bibtex": "@InProceedings{pmlr-v162-liang22b,\n title = \t {Exploring and Exploiting Hubness Priors for High-Quality {GAN} Latent Sampling},\n author = {Liang, Yuanbang and Wu, Jing and Lai, Yu-Kun and Qin, Yipeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13271--13284},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liang22b/liang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/liang22b.html},\n abstract = \t {Despite the extensive studies on Generative Adversarial Networks (GANs), how to reliably sample high-quality images from their latent spaces remains an under-explored topic. In this paper, we propose a novel GAN latent sampling method by exploring and exploiting the hubness priors of GAN latent distributions. Our key insight is that the high dimensionality of the GAN latent space will inevitably lead to the emergence of hub latents that usually have much larger sampling densities than other latents in the latent space. As a result, these hub latents are better trained and thus contribute more to the synthesis of high-quality images. Unlike the a posterior \"cherry-picking\", our method is highly efficient as it is an a priori method that identifies high-quality latents before the synthesis of images. Furthermore, we show that the well-known but purely empirical truncation trick is a naive approximation to the central clustering effect of hub latents, which not only uncovers the rationale of the truncation trick, but also indicates the superiority and fundamentality of our method. Extensive experimental results demonstrate the effectiveness of the proposed method. 
Our code is available at: https://github.com/Byronliang8/HubnessGANSampling.}\n}", "pdf": "https://proceedings.mlr.press/v162/liang22b/liang22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liang22b-supp.zip", "pdf_size": 35404209, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12825471375795704979&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Computer Science and Informatics, Cardiff University; School of Computer Science and Informatics, Cardiff University; School of Computer Science and Informatics, Cardiff University; School of Computer Science and Informatics, Cardiff University", "aff_domain": "cardiff.ac.uk; ; ;cardiff.ac.uk", "email": "cardiff.ac.uk; ; ;cardiff.ac.uk", "github": "https://github.com/Byronliang8/HubnessGANSampling", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liang22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cardiff University", "aff_unique_dep": "School of Computer Science and Informatics", "aff_unique_url": "https://www.cardiff.ac.uk", "aff_unique_abbr": "Cardiff", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cardiff", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Exploring the Gap between Collapsed & Whitened Features in Self-Supervised Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18293", "id": "18293", "proceeding": "https://proceedings.mlr.press/v162/he22c.html", "poster": "/media/PosterPDFs/ICML%202022/9778d5d219c5080b9a6a17bef029331c.png?t=1657841858.3992035", "slides": "", "author_site": "Bobby He, Mete Ozay", "author": "Bobby He; Mete Ozay", "abstract": "Avoiding feature collapse, when a Neural Network (NN) encoder maps all inputs to a constant vector, is a shared implicit desideratum of various methodological advances in self-supervised learning (SSL). To that end, whitened features have been proposed as an explicit objective to ensure uncollapsed features \\cite{zbontar2021barlow,ermolov2021whitening,hua2021feature,bardes2022vicreg}. We identify power law behaviour in eigenvalue decay, parameterised by exponent $\\beta{\\geq}0$, as a spectrum that bridges between the collapsed & whitened feature extremes. We provide theoretical & empirical evidence highlighting the factors in SSL, like projection layers & regularisation strength, that influence eigenvalue decay rate, & demonstrate that the degree of feature whitening affects generalisation, particularly in label scarce regimes. 
We use our insights to motivate a novel method, PMP (PostMan-Pat), which efficiently post-processes a pretrained encoder to enforce eigenvalue decay rate with power law exponent $\\beta$, & find that PostMan-Pat delivers improved label efficiency and transferability across a range of SSL methods and encoder architectures.", "bibtex": "@InProceedings{pmlr-v162-he22c,\n title = \t {Exploring the Gap between Collapsed & Whitened Features in Self-Supervised Learning},\n author = {He, Bobby and Ozay, Mete},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8613--8634},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22c/he22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22c.html},\n abstract = \t {Avoiding feature collapse, when a Neural Network (NN) encoder maps all inputs to a constant vector, is a shared implicit desideratum of various methodological advances in self-supervised learning (SSL). To that end, whitened features have been proposed as an explicit objective to ensure uncollapsed features \\cite{zbontar2021barlow,ermolov2021whitening,hua2021feature,bardes2022vicreg}. We identify power law behaviour in eigenvalue decay, parameterised by exponent $\\beta{\\geq}0$, as a spectrum that bridges between the collapsed & whitened feature extremes. We provide theoretical & empirical evidence highlighting the factors in SSL, like projection layers & regularisation strength, that influence eigenvalue decay rate, & demonstrate that the degree of feature whitening affects generalisation, particularly in label scarce regimes. 
We use our insights to motivate a novel method, PMP (PostMan-Pat), which efficiently post-processes a pretrained encoder to enforce eigenvalue decay rate with power law exponent $\\beta$, & find that PostMan-Pat delivers improved label efficiency and transferability across a range of SSL methods and encoder architectures.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22c/he22c.pdf", "supp": "", "pdf_size": 1196204, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17464977335505627300&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Oxford + Samsung Research UK; Samsung Research UK", "aff_domain": "stats.ox.ac.uk; ", "email": "stats.ox.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/he22c.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "University of Oxford;Samsung", "aff_unique_dep": ";Samsung Research UK", "aff_unique_url": "https://www.ox.ac.uk;https://www.samsung.com/uk/research/", "aff_unique_abbr": "Oxford;SRUK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "Expression might be enough: representing pressure and demand for reinforcement learning based traffic signal control", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16173", "id": "16173", "proceeding": "https://proceedings.mlr.press/v162/zhang22ah.html", "poster": "/media/PosterPDFs/ICML%202022/01e9565cecc4e989123f9620c1d09c09.png?t=1656934106.3217115", "slides": "/media/icml-2022/Slides/16173_DZJI60v.pdf", "author_site": "Liang Zhang, Qiang Wu, Jun Shen, Linyuan L\u00fc, Bo Du, Jianqing Wu", "author": "Liang Zhang; Qiang Wu; Jun Shen; Linyuan L\u00fc; Bo Du; Jianqing Wu", "abstract": "Many studies confirmed that a proper traffic state representation is more important than complex algorithms for the classical traffic signal control (TSC) problem. In this paper, we (1) present a novel, flexible and efficient method, namely advanced max pressure (Advanced-MP), taking both running and queuing vehicles into consideration to decide whether to change current signal phase; (2) inventively design the traffic movement representation with the efficient pressure and effective running vehicles from Advanced-MP, namely advanced traffic state (ATS); and (3) develop a reinforcement learning (RL) based algorithm template, called Advanced-XLight, by combining ATS with the latest RL approaches, and generate two RL algorithms, namely \"Advanced-MPLight\" and \"Advanced-CoLight\" from Advanced-XLight. 
Comprehensive experiments on multiple real-world datasets show that: (1) the Advanced-MP outperforms baseline methods, and it is also efficient and reliable for deployment; and (2) Advanced-MPLight and Advanced-CoLight can achieve the state-of-the-art.", "bibtex": "@InProceedings{pmlr-v162-zhang22ah,\n title = \t {Expression might be enough: representing pressure and demand for reinforcement learning based traffic signal control},\n author = {Zhang, Liang and Wu, Qiang and Shen, Jun and L{\\\"u}, Linyuan and Du, Bo and Wu, Jianqing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26645--26654},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ah/zhang22ah.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ah.html},\n abstract = \t {Many studies confirmed that a proper traffic state representation is more important than complex algorithms for the classical traffic signal control (TSC) problem. In this paper, we (1) present a novel, flexible and efficient method, namely advanced max pressure (Advanced-MP), taking both running and queuing vehicles into consideration to decide whether to change current signal phase; (2) inventively design the traffic movement representation with the efficient pressure and effective running vehicles from Advanced-MP, namely advanced traffic state (ATS); and (3) develop a reinforcement learning (RL) based algorithm template, called Advanced-XLight, by combining ATS with the latest RL approaches, and generate two RL algorithms, namely \"Advanced-MPLight\" and \"Advanced-CoLight\" from Advanced-XLight. 
Comprehensive experiments on multiple real-world datasets show that: (1) the Advanced-MP outperforms baseline methods, and it is also efficient and reliable for deployment; and (2) Advanced-MPLight and Advanced-CoLight can achieve the state-of-the-art.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ah/zhang22ah.pdf", "supp": "", "pdf_size": 1449007, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=995321608406249380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Life Sciences, Lanzhou University; Institute of Fundamental and Frontier Sciences, University of Electronic Science and Technology of China; School of Computing and Information Technology, University of Wollongong; Institute of Fundamental and Frontier Sciences, University of Electronic Science and Technology of China + SMART Infrastructure Facility, University of Wollongong; SMART Infrastructure Facility, University of Wollongong; School of Information Engineering, Jiangxi University of Science and Technology", "aff_domain": "uestc.edu.cn; ; ; ; ; ", "email": "uestc.edu.cn; ; ; ; ; ", "github": "https://github.com/LiangZhang1996/Advancd XLight1", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22ah.html", "aff_unique_index": "0;1;2;1+2;2;3", "aff_unique_norm": "Lanzhou University;University of Electronic Science and Technology of China;University of Wollongong;Jiangxi University of Science and Technology", "aff_unique_dep": "School of Life Sciences;Institute of Fundamental and Frontier Sciences;School of Computing and Information Technology;School of Information Engineering", "aff_unique_url": "http://www.lzu.edu.cn;https://www.uestc.edu.cn;https://www.uow.edu.au;http://www.jxust.edu.cn", "aff_unique_abbr": ";UESTC;UOW;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Wollongong", "aff_country_unique_index": "0;0;1;0+1;1;0", "aff_country_unique": "China;Australia" }, { "title": "Extended Unconstrained Features Model for Exploring Deep Neural Collapse", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18157", "id": "18157", "proceeding": "https://proceedings.mlr.press/v162/tirer22a.html", "poster": "/media/PosterPDFs/ICML%202022/d556b595eef47660153b8eddebbc2b12.png?t=1657588025.595031", "slides": "", "author_site": "Tom Tirer, Joan Bruna", "author": "Tom Tirer; Joan Bruna", "abstract": "The modern strategy for training deep neural networks for classification tasks includes optimizing the network\u2019s weights even after the training error vanishes to further push the training loss toward zero. Recently, a phenomenon termed \u201cneural collapse\" (NC) has been empirically observed in this training procedure. Specifically, it has been shown that the learned features (the output of the penultimate layer) of within-class samples converge to their mean, and the means of different classes exhibit a certain tight frame structure, which is also aligned with the last layer\u2019s weights. Recent papers have shown that minimizers with this structure emerge when optimizing a simplified \u201cunconstrained features model\" (UFM) with a regularized cross-entropy loss. In this paper, we further analyze and extend the UFM. First, we study the UFM for the regularized MSE loss, and show that the minimizers\u2019 features can have a more delicate structure than in the cross-entropy case. This affects also the structure of the weights. 
Then, we extend the UFM by adding another layer of weights as well as ReLU nonlinearity to the model and generalize our previous results. Finally, we empirically demonstrate the usefulness of our nonlinear extended UFM in modeling the NC phenomenon that occurs with practical networks.", "bibtex": "@InProceedings{pmlr-v162-tirer22a,\n title = \t {Extended Unconstrained Features Model for Exploring Deep Neural Collapse},\n author = {Tirer, Tom and Bruna, Joan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21478--21505},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tirer22a/tirer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tirer22a.html},\n abstract = \t {The modern strategy for training deep neural networks for classification tasks includes optimizing the network\u2019s weights even after the training error vanishes to further push the training loss toward zero. Recently, a phenomenon termed \u201cneural collapse\" (NC) has been empirically observed in this training procedure. Specifically, it has been shown that the learned features (the output of the penultimate layer) of within-class samples converge to their mean, and the means of different classes exhibit a certain tight frame structure, which is also aligned with the last layer\u2019s weights. Recent papers have shown that minimizers with this structure emerge when optimizing a simplified \u201cunconstrained features model\" (UFM) with a regularized cross-entropy loss. In this paper, we further analyze and extend the UFM. First, we study the UFM for the regularized MSE loss, and show that the minimizers\u2019 features can have a more delicate structure than in the cross-entropy case. This affects also the structure of the weights. Then, we extend the UFM by adding another layer of weights as well as ReLU nonlinearity to the model and generalize our previous results. 
Finally, we empirically demonstrate the usefulness of our nonlinear extended UFM in modeling the NC phenomenon that occurs with practical networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/tirer22a/tirer22a.pdf", "supp": "", "pdf_size": 1700745, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16014861150953748675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Center for Data Science, New York University, New York + Courant Institute of Mathematical Sciences, New York University, New York; Center for Data Science, New York University, New York + Courant Institute of Mathematical Sciences, New York University, New York", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/tirer22a.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "New York University", "aff_unique_dep": "Center for Data Science", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "New York", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United States" }, { "title": "Extracting Latent State Representations with Linear Dynamics from Rich Observations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16997", "id": "16997", "proceeding": "https://proceedings.mlr.press/v162/frandsen22a.html", "poster": "/media/PosterPDFs/ICML%202022/0d98b597aa732aea606bde680c3b57d8.png?t=1658286603.3965456", "slides": "", "author_site": "Abraham Frandsen, Rong Ge, Holden Lee", "author": "Abraham Frandsen; Rong Ge; Holden Lee", "abstract": "Recently, many reinforcement learning techniques have been shown to have provable guarantees in the simple case of linear dynamics, especially in problems like linear quadratic regulators. However, in practice many tasks require learning a policy from rich, high-dimensional features such as images, which are unlikely to be linear. We consider a setting where there is a hidden linear subspace of the high-dimensional feature space in which the dynamics are linear. We design natural objectives based on forward and inverse dynamics models. We prove that these objectives can be efficiently optimized and their local optimizers extract the hidden linear subspace. We empirically verify our theoretical results with synthetic data and explore the effectiveness of our approach (generalized to nonlinear settings) in simple control tasks with rich observations.", "bibtex": "@InProceedings{pmlr-v162-frandsen22a,\n title = \t {Extracting Latent State Representations with Linear Dynamics from Rich Observations},\n author = {Frandsen, Abraham and Ge, Rong and Lee, Holden},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6705--6725},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/frandsen22a/frandsen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/frandsen22a.html},\n abstract = \t {Recently, many reinforcement learning techniques have been shown to have provable guarantees in the simple case of linear dynamics, especially in problems like linear quadratic regulators. 
However, in practice many tasks require learning a policy from rich, high-dimensional features such as images, which are unlikely to be linear. We consider a setting where there is a hidden linear subspace of the high-dimensional feature space in which the dynamics are linear. We design natural objectives based on forward and inverse dynamics models. We prove that these objectives can be efficiently optimized and their local optimizers extract the hidden linear subspace. We empirically verify our theoretical results with synthetic data and explore the effectiveness of our approach (generalized to nonlinear settings) in simple control tasks with rich observations.}\n}", "pdf": "https://proceedings.mlr.press/v162/frandsen22a/frandsen22a.pdf", "supp": "", "pdf_size": 509994, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17615097086264711198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Duke University, Durham, North Carolina, USA+Department of Mathematics, Duke University, Durham, North Carolina, USA; Department of Computer Science, Duke University, Durham, North Carolina, USA; Department of Mathematics, Duke University, Durham, North Carolina, USA", "aff_domain": "cs.duke.edu;cs.duke.edu;duke.edu", "email": "cs.duke.edu;cs.duke.edu;duke.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/frandsen22a.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Durham", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "FEDformer: Frequency Enhanced Decomposed Transformer for Long-term Series Forecasting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17985", "id": "17985", "proceeding": "https://proceedings.mlr.press/v162/zhou22g.html", "poster": "", "slides": "", "author_site": "Tian Zhou, Ziqing MA, Qingsong Wen, Xue Wang, Liang Sun, rong jin", "author": "Tian Zhou; Ziqing Ma; Qingsong Wen; Xue Wang; Liang Sun; Rong Jin", "abstract": "Long-term time series forecasting is challenging since prediction accuracy tends to decrease dramatically with the increasing horizon. Although Transformer-based methods have significantly improved state-of-the-art results for long-term forecasting, they are not only computationally expensive but more importantly, are unable to capture the global view of time series (e.g. overall trend). To address these problems, we propose to combine Transformer with the seasonal-trend decomposition method, in which the decomposition method captures the global profile of time series while Transformers capture more detailed structures. To further enhance the performance of Transformer for long-term prediction, we exploit the fact that most time series tend to have a sparse representation in a well-known basis such as Fourier transform, and develop a frequency enhanced Transformer. Besides being more effective, the proposed method, termed as Frequency Enhanced Decomposed Transformer (FEDformer), is more efficient than standard Transformer with a linear complexity to the sequence length. 
Our empirical studies with six benchmark datasets show that compared with state-of-the-art methods, Fedformer can reduce prediction error by 14.8% and 22.6% for multivariate and univariate time series, respectively. Code is publicly available at https://github.com/MAZiqing/FEDformer.", "bibtex": "@InProceedings{pmlr-v162-zhou22g,\n title = \t {{FED}former: Frequency Enhanced Decomposed Transformer for Long-term Series Forecasting},\n author = {Zhou, Tian and Ma, Ziqing and Wen, Qingsong and Wang, Xue and Sun, Liang and Jin, Rong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27268--27286},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22g/zhou22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22g.html},\n abstract = \t {Long-term time series forecasting is challenging since prediction accuracy tends to decrease dramatically with the increasing horizon. Although Transformer-based methods have significantly improved state-of-the-art results for long-term forecasting, they are not only computationally expensive but more importantly, are unable to capture the global view of time series (e.g. overall trend). To address these problems, we propose to combine Transformer with the seasonal-trend decomposition method, in which the decomposition method captures the global profile of time series while Transformers capture more detailed structures. To further enhance the performance of Transformer for long-term prediction, we exploit the fact that most time series tend to have a sparse representation in a well-known basis such as Fourier transform, and develop a frequency enhanced Transformer. Besides being more effective, the proposed method, termed as Frequency Enhanced Decomposed Transformer (FEDformer), is more efficient than standard Transformer with a linear complexity to the sequence length. Our empirical studies with six benchmark datasets show that compared with state-of-the-art methods, Fedformer can reduce prediction error by 14.8% and 22.6% for multivariate and univariate time series, respectively. 
Code is publicly available at https://github.com/MAZiqing/FEDformer.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22g/zhou22g.pdf", "supp": "", "pdf_size": 743725, "gs_citation": 2128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=447506194635826863&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DAMO Academy, Alibaba Group, Hangzhou, China; DAMO Academy, Alibaba Group, Hangzhou, China; DAMO Academy, Alibaba Group, Bellevue, USA; DAMO Academy, Alibaba Group, Bellevue, USA; DAMO Academy, Alibaba Group, Bellevue, USA; DAMO Academy, Alibaba Group, Bellevue, USA", "aff_domain": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "email": "alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com;alibaba-inc.com", "github": "https://github.com/MAZiqing/FEDformer", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhou22g.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "DAMO Academy", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "0;0;1;1;1;1", "aff_campus_unique": "Hangzhou;Bellevue", "aff_country_unique_index": "0;0;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "FITNESS: (Fine Tune on New and Similar Samples) to detect anomalies in streams with drift and outliers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17183", "id": "17183", "proceeding": "https://proceedings.mlr.press/v162/sankararaman22a.html", "poster": "/media/PosterPDFs/ICML%202022/486fbd761bfa5400722324fdc9822adc.png?t=1658106365.7302089", "slides": "", "author_site": "Abishek Sankararaman, Balakrishnan Narayanaswamy, Vikramank Singh, Zhao Song", "author": "Abishek Sankararaman; Balakrishnan Narayanaswamy; Vikramank Y Singh; Zhao Song", "abstract": "Technology improvements have made it easier than ever to collect diverse telemetry at high resolution from any cyber or physical system, for both monitoring and control. In the domain of monitoring, anomaly detection has become an important problem in many research areas ranging from IoT and sensor networks to devOps. These systems operate in real, noisy and non-stationary environments. A fundamental question is then, \u2018", "bibtex": "@InProceedings{pmlr-v162-sankararaman22a,\n title = \t {{FITNESS}: ({F}ine Tune on New and Similar Samples) to detect anomalies in streams with drift and outliers},\n author = {Sankararaman, Abishek and Narayanaswamy, Balakrishnan and Singh, Vikramank Y and Song, Zhao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19153--19177},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sankararaman22a/sankararaman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sankararaman22a.html},\n abstract = \t {Technology improvements have made it easier than ever to collect diverse telemetry at high resolution from any cyber or physical system, for both monitoring and control. In the domain of monitoring, anomaly detection has become an important problem in many research areas ranging from IoT and sensor networks to devOps. 
These systems operate in real, noisy and non-stationary environments. A fundamental question is then, \u2018", "pdf": "https://proceedings.mlr.press/v162/sankararaman22a/sankararaman22a.pdf", "supp": "", "pdf_size": 1883521, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3069516619028436267&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Amazon AWS AI Labs, Santa Clara, CA; Amazon AWS AI Labs, Santa Clara, CA; Amazon AWS AI Labs, Santa Clara, CA; Amazon AWS AI Labs, Santa Clara, CA", "aff_domain": "amazon.com; ; ; ", "email": "amazon.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/sankararaman22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "AI Labs", "aff_unique_url": "https://aws.amazon.com", "aff_unique_abbr": "Amazon AWS AI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Santa Clara", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "FOCUS: Familiar Objects in Common and Uncommon Settings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17805", "id": "17805", "proceeding": "https://proceedings.mlr.press/v162/kattakinda22a.html", "poster": "", "slides": "", "author_site": "Priyatham Kattakinda, Soheil Feizi", "author": "Priyatham Kattakinda; Soheil Feizi", "abstract": "Standard training datasets for deep learning often do not contain objects in uncommon and rare settings (e.g., \u201ca plane on water\u201d, \u201ca car in snowy weather\u201d). This can cause models trained on these datasets to incorrectly predict objects that are typical for the context in the image, rather than identifying the objects that are actually present. In this paper, we introduce FOCUS (Familiar Objects in Common and Uncommon Settings), a dataset for stress-testing the generalization power of deep image classifiers. By leveraging the power of modern search engines, we deliberately gather data containing objects in common and uncommon settings; in a wide range of locations, weather conditions, and time of day. We present a detailed analysis of the performance of various popular image classifiers on our dataset and demonstrate a clear drop in accuracy when classifying images in uncommon settings. We also show that finetuning a model on our dataset drastically improves its ability to focus on the object of interest leading to better generalization. Lastly, we leverage FOCUS to machine annotate additional visual attributes for the entirety of ImageNet. 
We believe that our dataset will aid researchers in understanding the inability of deep models to generalize well to uncommon settings and drive future work on improving their distributional robustness.", "bibtex": "@InProceedings{pmlr-v162-kattakinda22a,\n title = \t {{FOCUS}: Familiar Objects in Common and Uncommon Settings},\n author = {Kattakinda, Priyatham and Feizi, Soheil},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10825--10847},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kattakinda22a/kattakinda22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kattakinda22a.html},\n abstract = \t {Standard training datasets for deep learning often do not contain objects in uncommon and rare settings (e.g., \u201ca plane on water\u201d, \u201ca car in snowy weather\u201d). This can cause models trained on these datasets to incorrectly predict objects that are typical for the context in the image, rather than identifying the objects that are actually present. In this paper, we introduce FOCUS (Familiar Objects in Common and Uncommon Settings), a dataset for stress-testing the generalization power of deep image classifiers. By leveraging the power of modern search engines, we deliberately gather data containing objects in common and uncommon settings; in a wide range of locations, weather conditions, and time of day. We present a detailed analysis of the performance of various popular image classifiers on our dataset and demonstrate a clear drop in accuracy when classifying images in uncommon settings. We also show that finetuning a model on our dataset drastically improves its ability to focus on the object of interest leading to better generalization. Lastly, we leverage FOCUS to machine annotate additional visual attributes for the entirety of ImageNet. 
We believe that our dataset will aid researchers in understanding the inability of deep models to generalize well to uncommon settings and drive future work on improving their distributional robustness.}\n}", "pdf": "https://proceedings.mlr.press/v162/kattakinda22a/kattakinda22a.pdf", "supp": "", "pdf_size": 9468380, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2485805129814216346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Maryland, College Park, MD, USA; University of Maryland, College Park, MD, USA", "aff_domain": "umd.edu; ", "email": "umd.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kattakinda22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Failure and success of the spectral bias prediction for Laplace Kernel Ridge Regression: the case of low-dimensional data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16237", "id": "16237", "proceeding": "https://proceedings.mlr.press/v162/tomasini22a.html", "poster": "/media/PosterPDFs/ICML%202022/8deb8d1dd92840f975b6931ab3a3c61e.png?t=1657617493.3893983", "slides": "", "author_site": "Umberto M. Tomasini, Antonio Sclocchi, Matthieu Wyart", "author": "Umberto M Tomasini; Antonio Sclocchi; Matthieu Wyart", "abstract": "Recently, several theories including the replica method made predictions for the generalization error of Kernel Ridge Regression. In some regimes, they predict that the method has a \u2018spectral bias\u2019: decomposing the true function $f^*$ on the eigenbasis of the kernel, it fits well the coefficients associated with the O(P) largest eigenvalues, where $P$ is the size of the training set. This prediction works very well on benchmark data sets such as images, yet the assumptions these approaches make on the data are never satisfied in practice. To clarify when the spectral bias prediction holds, we first focus on a one-dimensional model where rigorous results are obtained and then use scaling arguments to generalize and test our findings in higher dimensions. Our predictions include the classification case $f(x)=$sign$(x_1)$ with a data distribution that vanishes at the decision boundary $p(x)\\sim x_1^{\\chi}$. 
For $\\chi>0$ and a Laplace kernel, we find that (i) there exists a cross-over ridge $\\lambda^*_{d,\\chi}(P)\\sim P^{-\\frac{1}{d+\\chi}}$ such that for $\\lambda\\gg \\lambda^*_{d,\\chi}(P)$, the replica method applies, but not for $\\lambda\\ll\\lambda^*_{d,\\chi}(P)$, (ii) in the ridge-less case, spectral bias predicts the correct training curve exponent only in the limit $d\\rightarrow\\infty$.", "bibtex": "@InProceedings{pmlr-v162-tomasini22a,\n title = \t {Failure and success of the spectral bias prediction for {L}aplace Kernel Ridge Regression: the case of low-dimensional data},\n author = {Tomasini, Umberto M and Sclocchi, Antonio and Wyart, Matthieu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21548--21583},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tomasini22a/tomasini22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tomasini22a.html},\n abstract = \t {Recently, several theories including the replica method made predictions for the generalization error of Kernel Ridge Regression. In some regimes, they predict that the method has a \u2018spectral bias\u2019: decomposing the true function $f^*$ on the eigenbasis of the kernel, it fits well the coefficients associated with the O(P) largest eigenvalues, where $P$ is the size of the training set. This prediction works very well on benchmark data sets such as images, yet the assumptions these approaches make on the data are never satisfied in practice. To clarify when the spectral bias prediction holds, we first focus on a one-dimensional model where rigorous results are obtained and then use scaling arguments to generalize and test our findings in higher dimensions. Our predictions include the classification case $f(x)=$sign$(x_1)$ with a data distribution that vanishes at the decision boundary $p(x)\\sim x_1^{\\chi}$. 
For $\\chi>0$ and a Laplace kernel, we find that (i) there exists a cross-over ridge $\\lambda^*_{d,\\chi}(P)\\sim P^{-\\frac{1}{d+\\chi}}$ such that for $\\lambda\\gg \\lambda^*_{d,\\chi}(P)$, the replica method applies, but not for $\\lambda\\ll\\lambda^*_{d,\\chi}(P)$, (ii) in the ridge-less case, spectral bias predicts the correct training curve exponent only in the limit $d\\rightarrow\\infty$.}\n}", "pdf": "https://proceedings.mlr.press/v162/tomasini22a/tomasini22a.pdf", "supp": "", "pdf_size": 856916, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=767450053352216293&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Physics, EPFL, Lausanne, Switzerland; Department of Physics, EPFL, Lausanne, Switzerland; Department of Physics, EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tomasini22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "Department of Physics", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Fair Generalized Linear Models with a Convex Penalty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17051", "id": "17051", "proceeding": "https://proceedings.mlr.press/v162/do22a.html", "poster": "/media/PosterPDFs/ICML%202022/0f21f0349462cacdc5796990d37760ae.png?t=1657920988.161742", "slides": "", "author_site": "Hyungrok Do, Preston Putzel, Axel Martin, Padhraic Smyth, Judy Zhong", "author": "Hyungrok Do; Preston Putzel; Axel S Martin; Padhraic Smyth; Judy Zhong", "abstract": "Despite recent advances in algorithmic fairness, methodologies for achieving fairness with generalized linear models (GLMs) have yet to be explored in general, despite GLMs being widely used in practice. In this paper we introduce two fairness criteria for GLMs based on equalizing expected outcomes or log-likelihoods. We prove that for GLMs both criteria can be achieved via a convex penalty term based solely on the linear components of the GLM, thus permitting efficient optimization. We also derive theoretical properties for the resulting fair GLM estimator. To empirically demonstrate the efficacy of the proposed fair GLM, we compare it with other well-known fair prediction methods on an extensive set of benchmark datasets for binary classification and regression. 
In addition, we demonstrate that the fair GLM can generate fair predictions for a range of response variables, other than binary and continuous outcomes.", "bibtex": "@InProceedings{pmlr-v162-do22a,\n title = \t {Fair Generalized Linear Models with a Convex Penalty},\n author = {Do, Hyungrok and Putzel, Preston and Martin, Axel S and Smyth, Padhraic and Zhong, Judy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5286--5308},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/do22a/do22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/do22a.html},\n abstract = \t {Despite recent advances in algorithmic fairness, methodologies for achieving fairness with generalized linear models (GLMs) have yet to be explored in general, despite GLMs being widely used in practice. In this paper we introduce two fairness criteria for GLMs based on equalizing expected outcomes or log-likelihoods. We prove that for GLMs both criteria can be achieved via a convex penalty term based solely on the linear components of the GLM, thus permitting efficient optimization. We also derive theoretical properties for the resulting fair GLM estimator. To empirically demonstrate the efficacy of the proposed fair GLM, we compare it with other well-known fair prediction methods on an extensive set of benchmark datasets for binary classification and regression. In addition, we demonstrate that the fair GLM can generate fair predictions for a range of response variables, other than binary and continuous outcomes.}\n}", "pdf": "https://proceedings.mlr.press/v162/do22a/do22a.pdf", "supp": "", "pdf_size": 1228957, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11693304205339987181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Population Health, NYU Grossman School of Medicine, New York, NY, USA+Department of Computer Science, University of California, Irvine, CA, USA; Department of Computer Science, University of California, Irvine, CA, USA; Department of Population Health, NYU Grossman School of Medicine, New York, NY, USA; Department of Computer Science, University of California, Irvine, CA, USA; Department of Population Health, NYU Grossman School of Medicine, New York, NY, USA", "aff_domain": "nyulangone.org;uci.edu;nyulangone.org;uci.edu;nyulangone.org", "email": "nyulangone.org;uci.edu;nyulangone.org;uci.edu;nyulangone.org", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/do22a.html", "aff_unique_index": "0+1;1;0;1;0", "aff_unique_norm": "NYU Grossman School of Medicine;University of California, Irvine", "aff_unique_dep": "Department of Population Health;Department of Computer Science", "aff_unique_url": "https://med.nyu.edu;https://www.uci.edu", "aff_unique_abbr": "NYU Grossman SOM;UCI", "aff_campus_unique_index": "0+1;1;0;1;0", "aff_campus_unique": "New York;Irvine", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fair Representation Learning through Implicit Path Alignment", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16477", "id": "16477", "proceeding": "https://proceedings.mlr.press/v162/shui22a.html", 
"poster": "/media/PosterPDFs/ICML%202022/f6e794a75c5d51de081dbefa224304f9.png?t=1657205225.8781743", "slides": "/media/icml-2022/Slides/16477.pdf", "author_site": "Changjian Shui, Qi CHEN, Jiaqi Li, Boyu Wang, Christian Gagne", "author": "Changjian Shui; Qi Chen; Jiaqi Li; Boyu Wang; Christian Gagn\u00e9", "abstract": "We consider a fair representation learning perspective, where optimal predictors, on top of the data representation, are ensured to be invariant with respect to different sub-groups. Specifically, we formulate this intuition as a bi-level optimization, where the representation is learned in the outer-loop, and invariant optimal group predictors are updated in the inner-loop. Moreover, the proposed bi-level objective is demonstrated to fulfill the sufficiency rule, which is desirable in various practical scenarios but was not commonly studied in the fair learning. Besides, to avoid the high computational and memory cost of differentiating in the inner-loop of bi-level objective, we propose an implicit path alignment algorithm, which only relies on the solution of inner optimization and the implicit differentiation rather than the exact optimization path. We further analyze the error gap of the implicit approach and empirically validate the proposed method in both classification and regression settings. Experimental results show the consistently better trade-off in prediction performance and fairness measurement.", "bibtex": "@InProceedings{pmlr-v162-shui22a,\n title = \t {Fair Representation Learning through Implicit Path Alignment},\n author = {Shui, Changjian and Chen, Qi and Li, Jiaqi and Wang, Boyu and Gagn{\\'e}, Christian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20156--20175},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shui22a/shui22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shui22a.html},\n abstract = \t {We consider a fair representation learning perspective, where optimal predictors, on top of the data representation, are ensured to be invariant with respect to different sub-groups. Specifically, we formulate this intuition as a bi-level optimization, where the representation is learned in the outer-loop, and invariant optimal group predictors are updated in the inner-loop. Moreover, the proposed bi-level objective is demonstrated to fulfill the sufficiency rule, which is desirable in various practical scenarios but was not commonly studied in the fair learning. Besides, to avoid the high computational and memory cost of differentiating in the inner-loop of bi-level objective, we propose an implicit path alignment algorithm, which only relies on the solution of inner optimization and the implicit differentiation rather than the exact optimization path. We further analyze the error gap of the implicit approach and empirically validate the proposed method in both classification and regression settings. 
Experimental results show the consistently better trade-off in prediction performance and fairness measurement.}\n}", "pdf": "https://proceedings.mlr.press/v162/shui22a/shui22a.pdf", "supp": "", "pdf_size": 555554, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8629565957981360081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Universit\u00e9 Laval, Qu\u00e9bec, Canada; Universit\u00e9 Laval, Qu\u00e9bec, Canada; University of Western Ontario, Ontario, Canada; University of Western Ontario, Ontario, Canada; Universit\u00e9 Laval, Qu\u00e9bec, Canada + Canada CIFAR AI Chair, Mila", "aff_domain": "ulaval.ca;ulaval.ca;uwo.ca;csd.uwo.ca;gel.ulaval.ca", "email": "ulaval.ca;ulaval.ca;uwo.ca;csd.uwo.ca;gel.ulaval.ca", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/shui22a.html", "aff_unique_index": "0;0;1;1;0+2", "aff_unique_norm": "Universit\u00e9 Laval;University of Western Ontario;Mila", "aff_unique_dep": ";;Canada CIFAR AI Chair", "aff_unique_url": "https://www.ulaval.ca;https://www.uwo.ca;https://mila.quebec", "aff_unique_abbr": "ULaval;UWO;Mila", "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Qu\u00e9bec;Ontario;", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "Canada" }, { "title": "Fair and Fast k-Center Clustering for Data Summarization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17569", "id": "17569", "proceeding": "https://proceedings.mlr.press/v162/angelidakis22a.html", "poster": "/media/PosterPDFs/ICML%202022/594ca7adb3277c51a998252e2d4c906e_iJpD5WW.png?t=1658712381.019607", "slides": "/media/icml-2022/Slides/17569.pdf", "author_site": "Haris Angelidakis, Adam Kurpisz, Leon Sering, Rico Zenklusen", "author": "Haris Angelidakis; Adam Kurpisz; Leon Sering; Rico Zenklusen", "abstract": "We consider two key issues faced by many clustering methods when used for data summarization, namely (a) an unfair representation of \"demographic groups\u201d and (b) distorted summarizations, where data points in the summary represent subsets of the original data of vastly different sizes. Previous work made important steps towards handling separately each of these two issues in the context of the fundamental k-Center clustering objective through the study of fast algorithms for natural models that address them. 
We show that it is possible to effectively address both (a) and (b) simultaneously by presenting a clustering procedure that works for a canonical combined model and (i) is fast, both in theory and practice, (ii) exhibits a worst-case constant-factor guarantee, and (iii) gives promising computational results showing that there can be significant benefits in addressing both issues together instead of sequentially.", "bibtex": "@InProceedings{pmlr-v162-angelidakis22a,\n title = \t {Fair and Fast k-Center Clustering for Data Summarization},\n author = {Angelidakis, Haris and Kurpisz, Adam and Sering, Leon and Zenklusen, Rico},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {669--702},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/angelidakis22a/angelidakis22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/angelidakis22a.html},\n abstract = \t {We consider two key issues faced by many clustering methods when used for data summarization, namely (a) an unfair representation of \"demographic groups\u201d and (b) distorted summarizations, where data points in the summary represent subsets of the original data of vastly different sizes. Previous work made important steps towards handling separately each of these two issues in the context of the fundamental k-Center clustering objective through the study of fast algorithms for natural models that address them. We show that it is possible to effectively address both (a) and (b) simultaneously by presenting a clustering procedure that works for a canonical combined model and (i) is fast, both in theory and practice, (ii) exhibits a worst-case constant-factor guarantee, and (iii) gives promising computational results showing that there can be significant benefits in addressing both issues together instead of sequentially.}\n}", "pdf": "https://proceedings.mlr.press/v162/angelidakis22a/angelidakis22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/angelidakis22a-supp.zip", "pdf_size": 1037631, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=998456530749474270&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "CoW Protocol; Department of Mathematics, ETH Zurich, Zurich, Switzerland; Department of Mathematics, ETH Zurich, Zurich, Switzerland; Department of Mathematics, ETH Zurich, Zurich, Switzerland", "aff_domain": "ifor.math.ethz.ch; ; ; ", "email": "ifor.math.ethz.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/angelidakis22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "CoW Protocol;ETH Zurich", "aff_unique_dep": ";Department of Mathematics", "aff_unique_url": ";https://www.ethz.ch", "aff_unique_abbr": ";ETHZ", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "1;1;1", "aff_country_unique": ";Switzerland" }, { "title": "Fairness Interventions as (Dis)Incentives for Strategic Manipulation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15987", "id": "15987", "proceeding": "https://proceedings.mlr.press/v162/zhang22l.html", "poster": 
"/media/PosterPDFs/ICML%202022/afb79a9be5cd9762572a008088d3153e.png?t=1657200037.7036238", "slides": "", "author_site": "Xueru Zhang, Mahdi Khalili, Kun Jin, Parinaz Naghizadeh, Mingyan Liu", "author": "Xueru Zhang; Mohammad Mahdi Khalili; Kun Jin; Parinaz Naghizadeh; Mingyan Liu", "abstract": "Although machine learning (ML) algorithms are widely used to make decisions about individuals in various domains, concerns have arisen that (1) these algorithms are vulnerable to strategic manipulation and \"gaming the algorithm\"; and (2) ML decisions may exhibit bias against certain social groups. Existing works have largely examined these as two separate issues, e.g., by focusing on building ML algorithms robust to strategic manipulation, or on training a fair ML algorithm. In this study, we set out to understand the impact they each have on the other, and examine how to characterize fair policies in the presence of strategic behavior. The strategic interaction between a decision maker and individuals (as decision takers) is modeled as a two-stage (Stackelberg) game; when designing an algorithm, the former anticipates the latter may manipulate their features in order to receive more favorable decisions. We analytically characterize the equilibrium strategies of both, and examine how the algorithms and their resulting fairness properties are affected when the decision maker is strategic (anticipates manipulation), as well as the impact of fairness interventions on equilibrium strategies. In particular, we identify conditions under which anticipation of strategic behavior may mitigate/exacerbate unfairness, and conditions under which fairness interventions can serve as (dis)incentives for strategic manipulation.", "bibtex": "@InProceedings{pmlr-v162-zhang22l,\n title = \t {Fairness Interventions as ({D}is){I}ncentives for Strategic Manipulation},\n author = {Zhang, Xueru and Khalili, Mohammad Mahdi and Jin, Kun and Naghizadeh, Parinaz and Liu, Mingyan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26239--26264},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22l/zhang22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22l.html},\n abstract = \t {Although machine learning (ML) algorithms are widely used to make decisions about individuals in various domains, concerns have arisen that (1) these algorithms are vulnerable to strategic manipulation and \"gaming the algorithm\"; and (2) ML decisions may exhibit bias against certain social groups. Existing works have largely examined these as two separate issues, e.g., by focusing on building ML algorithms robust to strategic manipulation, or on training a fair ML algorithm. In this study, we set out to understand the impact they each have on the other, and examine how to characterize fair policies in the presence of strategic behavior. The strategic interaction between a decision maker and individuals (as decision takers) is modeled as a two-stage (Stackelberg) game; when designing an algorithm, the former anticipates the latter may manipulate their features in order to receive more favorable decisions. 
We analytically characterize the equilibrium strategies of both, and examine how the algorithms and their resulting fairness properties are affected when the decision maker is strategic (anticipates manipulation), as well as the impact of fairness interventions on equilibrium strategies. In particular, we identify conditions under which anticipation of strategic behavior may mitigate/exacerbate unfairness, and conditions under which fairness interventions can serve as (dis)incentives for strategic manipulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22l/zhang22l.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhang22l-supp.zip", "pdf_size": 3192904, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12079165339924882330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22l.html" }, { "title": "Fairness with Adaptive Weights", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17015", "id": "17015", "proceeding": "https://proceedings.mlr.press/v162/chai22a.html", "poster": "/media/PosterPDFs/ICML%202022/0fe473396242072e84af286632d3f0ff.png?t=1658077566.564322", "slides": "", "author_site": "Junyi Chai, Xiaoqian Wang", "author": "Junyi Chai; Xiaoqian Wang", "abstract": "Fairness is now an important issue in machine learning. There are arising concerns that automated decision-making systems reflect real-world biases. Although a wide range of fairness-related methods have been proposed in recent years, the under-representation problem has been less studied. Due to the uneven distribution of samples from different populations, machine learning models tend to be biased against minority groups when trained by minimizing the average empirical risk across all samples. In this paper, we propose a novel adaptive reweighing method to address representation bias. The goal of our method is to achieve group-level balance among different demographic groups by learning adaptive weights for each sample. Our approach emphasizes more on error-prone samples in prediction and enhances adequate representation of minority groups for fairness. We derive a closed-form solution for adaptive weight assignment and propose an efficient algorithm with theoretical convergence guarantees. We theoretically analyze the fairness of our model and empirically verify that our method strikes a balance between fairness and accuracy. In experiments, our method achieves comparable or better performance than state-of-the-art methods in both classification and regression tasks. Furthermore, our method exhibits robustness to label noise on various benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-chai22a,\n title = \t {Fairness with Adaptive Weights},\n author = {Chai, Junyi and Wang, Xiaoqian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2853--2866},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chai22a/chai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chai22a.html},\n abstract = \t {Fairness is now an important issue in machine learning. 
There are arising concerns that automated decision-making systems reflect real-world biases. Although a wide range of fairness-related methods have been proposed in recent years, the under-representation problem has been less studied. Due to the uneven distribution of samples from different populations, machine learning models tend to be biased against minority groups when trained by minimizing the average empirical risk across all samples. In this paper, we propose a novel adaptive reweighing method to address representation bias. The goal of our method is to achieve group-level balance among different demographic groups by learning adaptive weights for each sample. Our approach emphasizes more on error-prone samples in prediction and enhances adequate representation of minority groups for fairness. We derive a closed-form solution for adaptive weight assignment and propose an efficient algorithm with theoretical convergence guarantees. We theoretically analyze the fairness of our model and empirically verify that our method strikes a balance between fairness and accuracy. In experiments, our method achieves comparable or better performance than state-of-the-art methods in both classification and regression tasks. Furthermore, our method exhibits robustness to label noise on various benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/chai22a/chai22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chai22a-supp.zip", "pdf_size": 725580, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12148538549036250753&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Elmore Family School of Electrical and Computer Engineering, Purdue University; Elmore Family School of Electrical and Computer Engineering, Purdue University", "aff_domain": "purdue.edu;purdue.edu", "email": "purdue.edu;purdue.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/chai22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Elmore Family School of Electrical and Computer Engineering", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fast Aquatic Swimmer Optimization with Differentiable Projective Dynamics and Neural Network Hydrodynamic Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18075", "id": "18075", "proceeding": "https://proceedings.mlr.press/v162/nava22a.html", "poster": "/media/PosterPDFs/ICML%202022/680390c55bbd9ce416d1d69a9ab4760d.png?t=1657700442.925447", "slides": "", "author_site": "Elvis Nava, John Zhang, Mike Yan Michelis, Tao Du, Pingchuan Ma, Benjamin F. Grewe, Wojciech Matusik, Robert Katzschmann", "author": "Elvis Nava; John Z Zhang; Mike Yan Michelis; Tao Du; Pingchuan Ma; Benjamin F. Grewe; Wojciech Matusik; Robert Kevin Katzschmann", "abstract": "Aquatic locomotion is a classic fluid-structure interaction (FSI) problem of interest to biologists and engineers. Solving the fully coupled FSI equations for incompressible Navier-Stokes and finite elasticity is computationally expensive. Optimizing robotic swimmer design within such a system generally involves cumbersome, gradient-free procedures on top of the already costly simulation. 
To address this challenge we present a novel, fully differentiable hybrid approach to FSI that combines a 2D direct numerical simulation for the deformable solid structure of the swimmer and a physics-constrained neural network surrogate to capture hydrodynamic effects of the fluid. For the deformable solid simulation of the swimmer\u2019s body, we use state-of-the-art techniques from the field of computer graphics to speed up the finite-element method (FEM). For the fluid simulation, we use a U-Net architecture trained with a physics-based loss function to predict the flow field at each time step. The pressure and velocity field outputs from the neural network are sampled around the boundary of our swimmer using an immersed boundary method (IBM) to compute its swimming motion accurately and efficiently. We demonstrate the computational efficiency and differentiability of our hybrid simulator on a 2D carangiform swimmer. Due to differentiability, the simulator can be used for computational design of controls for soft bodies immersed in fluids via direct gradient-based optimization.", "bibtex": "@InProceedings{pmlr-v162-nava22a,\n title = \t {Fast Aquatic Swimmer Optimization with Differentiable Projective Dynamics and Neural Network Hydrodynamic Models},\n author = {Nava, Elvis and Zhang, John Z and Michelis, Mike Yan and Du, Tao and Ma, Pingchuan and Grewe, Benjamin F. and Matusik, Wojciech and Katzschmann, Robert Kevin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16413--16427},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nava22a/nava22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nava22a.html},\n abstract = \t {Aquatic locomotion is a classic fluid-structure interaction (FSI) problem of interest to biologists and engineers. Solving the fully coupled FSI equations for incompressible Navier-Stokes and finite elasticity is computationally expensive. Optimizing robotic swimmer design within such a system generally involves cumbersome, gradient-free procedures on top of the already costly simulation. To address this challenge we present a novel, fully differentiable hybrid approach to FSI that combines a 2D direct numerical simulation for the deformable solid structure of the swimmer and a physics-constrained neural network surrogate to capture hydrodynamic effects of the fluid. For the deformable solid simulation of the swimmer\u2019s body, we use state-of-the-art techniques from the field of computer graphics to speed up the finite-element method (FEM). For the fluid simulation, we use a U-Net architecture trained with a physics-based loss function to predict the flow field at each time step. The pressure and velocity field outputs from the neural network are sampled around the boundary of our swimmer using an immersed boundary method (IBM) to compute its swimming motion accurately and efficiently. We demonstrate the computational efficiency and differentiability of our hybrid simulator on a 2D carangiform swimmer. 
Due to differentiability, the simulator can be used for computational design of controls for soft bodies immersed in fluids via direct gradient-based optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/nava22a/nava22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/nava22a-supp.zip", "pdf_size": 1229105, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17136288889741995441&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/nava22a.html" }, { "title": "Fast Composite Optimization and Statistical Recovery in Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17581", "id": "17581", "proceeding": "https://proceedings.mlr.press/v162/bao22b.html", "poster": "/media/PosterPDFs/ICML%202022/4c144c47ecba6f8318128703ca9e2601.png?t=1656994416.4009972", "slides": "", "author_site": "Yajie Bao, Michael Crawshaw, Shan Luo, Mingrui Liu", "author": "Yajie Bao; Michael Crawshaw; Shan Luo; Mingrui Liu", "abstract": "As a prevalent distributed learning paradigm, Federated Learning (FL) trains a global model on a massive amount of devices with infrequent communication. This paper investigates a class of composite optimization and statistical recovery problems in the FL setting, whose loss function consists of a data-dependent smooth loss and a non-smooth regularizer. Examples include sparse linear regression using Lasso, low-rank matrix recovery using nuclear norm regularization, etc. In the existing literature, federated composite optimization algorithms are designed only from an optimization perspective without any statistical guarantees. In addition, they do not consider commonly used (restricted) strong convexity in statistical recovery problems. We advance the frontiers of this problem from both optimization and statistical perspectives. From optimization upfront, we propose a new algorithm named", "bibtex": "@InProceedings{pmlr-v162-bao22b,\n title = \t {Fast Composite Optimization and Statistical Recovery in Federated Learning},\n author = {Bao, Yajie and Crawshaw, Michael and Luo, Shan and Liu, Mingrui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1508--1536},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bao22b/bao22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/bao22b.html},\n abstract = \t {As a prevalent distributed learning paradigm, Federated Learning (FL) trains a global model on a massive amount of devices with infrequent communication. This paper investigates a class of composite optimization and statistical recovery problems in the FL setting, whose loss function consists of a data-dependent smooth loss and a non-smooth regularizer. Examples include sparse linear regression using Lasso, low-rank matrix recovery using nuclear norm regularization, etc. In the existing literature, federated composite optimization algorithms are designed only from an optimization perspective without any statistical guarantees. 
In addition, they do not consider commonly used (restricted) strong convexity in statistical recovery problems. We advance the frontiers of this problem from both optimization and statistical perspectives. From optimization upfront, we propose a new algorithm named", "pdf": "https://proceedings.mlr.press/v162/bao22b/bao22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bao22b-supp.zip", "pdf_size": 865830, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16221559265108390939&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Mathematical Sciences, Shanghai Jiao Tong University, Shanghai, China; Department of Computer Science, George Mason University, Fairfax, VA 22030, USA; School of Mathematical Sciences, Shanghai Jiao Tong University, Shanghai, China; Department of Computer Science, George Mason University, Fairfax, VA 22030, USA", "aff_domain": "gmu.edu; ; ;gmu.edu", "email": "gmu.edu; ; ;gmu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bao22b.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;George Mason University", "aff_unique_dep": "School of Mathematical Sciences;Department of Computer Science", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.gmu.edu", "aff_unique_abbr": "SJTU;GMU", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Shanghai;Fairfax", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Fast Convex Optimization for Two-Layer ReLU Networks: Equivalent Model Classes and Cone Decompositions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16549", "id": "16549", "proceeding": "https://proceedings.mlr.press/v162/mishkin22a.html", "poster": "/media/PosterPDFs/ICML%202022/0e274e1d1a8948f16f0227e4ec1965a8.png?t=1657908574.70692", "slides": "", "author_site": "Aaron Mishkin, Arda Sahiner, Mert Pilanci", "author": "Aaron Mishkin; Arda Sahiner; Mert Pilanci", "abstract": "We develop fast algorithms and robust software for convex optimization of two-layer neural networks with ReLU activation functions. Our work leverages a convex re-formulation of the standard weight-decay penalized training problem as a set of group-l1-regularized data-local models, where locality is enforced by polyhedral cone constraints. In the special case of zero-regularization, we show that this problem is exactly equivalent to unconstrained optimization of a convex \"gated ReLU\" network. For problems with non-zero regularization, we show that convex gated ReLU models obtain data-dependent approximation bounds for the ReLU training problem. To optimize the convex re-formulations, we develop an accelerated proximal gradient method and a practical augmented Lagrangian solver. We show that these approaches are faster than standard training heuristics for the non-convex problem, such as SGD, and outperform commercial interior-point solvers. 
Experimentally, we verify our theoretical results, explore the group-l1 regularization path, and scale convex optimization for neural networks to image classification on MNIST and CIFAR-10.", "bibtex": "@InProceedings{pmlr-v162-mishkin22a,\n title = \t {Fast Convex Optimization for Two-Layer {R}e{LU} Networks: Equivalent Model Classes and Cone Decompositions},\n author = {Mishkin, Aaron and Sahiner, Arda and Pilanci, Mert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15770--15816},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mishkin22a/mishkin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mishkin22a.html},\n abstract = \t {We develop fast algorithms and robust software for convex optimization of two-layer neural networks with ReLU activation functions. Our work leverages a convex re-formulation of the standard weight-decay penalized training problem as a set of group-l1-regularized data-local models, where locality is enforced by polyhedral cone constraints. In the special case of zero-regularization, we show that this problem is exactly equivalent to unconstrained optimization of a convex \"gated ReLU\" network. For problems with non-zero regularization, we show that convex gated ReLU models obtain data-dependent approximation bounds for the ReLU training problem. To optimize the convex re-formulations, we develop an accelerated proximal gradient method and a practical augmented Lagrangian solver. We show that these approaches are faster than standard training heuristics for the non-convex problem, such as SGD, and outperform commercial interior-point solvers. 
Experimentally, we verify our theoretical results, explore the group-l1 regularization path, and scale convex optimization for neural networks to image classification on MNIST and CIFAR-10.}\n}", "pdf": "https://proceedings.mlr.press/v162/mishkin22a/mishkin22a.pdf", "supp": "", "pdf_size": 2562110, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7077031077028119954&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Stanford University; Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University", "aff_domain": "cs.stanford.edu; ; ", "email": "cs.stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mishkin22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Finite Width Neural Tangent Kernel", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17193", "id": "17193", "proceeding": "https://proceedings.mlr.press/v162/novak22a.html", "poster": "/media/PosterPDFs/ICML%202022/3483e5ec0489e5c394b028ec4e81f3e1.png?t=1657869665.1815343", "slides": "/media/icml-2022/Slides/17193.pdf", "author_site": "Roman Novak, Jascha Sohl-Dickstein, Samuel Schoenholz", "author": "Roman Novak; Jascha Sohl-Dickstein; Samuel S Schoenholz", "abstract": "The Neural Tangent Kernel (NTK), defined as the outer product of the neural network (NN) Jacobians, has emerged as a central object of study in deep learning. In the infinite width limit, the NTK can sometimes be computed analytically and is useful for understanding training and generalization of NN architectures. At finite widths, the NTK is also used to better initialize NNs, compare the conditioning across models, perform architecture search, and do meta-learning. Unfortunately, the finite width NTK is notoriously expensive to compute, which severely limits its practical utility. We perform the first in-depth analysis of the compute and memory requirements for NTK computation in finite width networks. Leveraging the structure of neural networks, we further propose two novel algorithms that change the exponent of the compute and memory requirements of the finite width NTK, dramatically improving efficiency. Our algorithms can be applied in a black box fashion to any differentiable function, including those implementing neural networks. 
We open-source our implementations within the Neural Tangents package at https://github.com/google/neural-tangents.", "bibtex": "@InProceedings{pmlr-v162-novak22a,\n title = \t {Fast Finite Width Neural Tangent Kernel},\n author = {Novak, Roman and Sohl-Dickstein, Jascha and Schoenholz, Samuel S},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17018--17044},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/novak22a/novak22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/novak22a.html},\n abstract = \t {The Neural Tangent Kernel (NTK), defined as the outer product of the neural network (NN) Jacobians, has emerged as a central object of study in deep learning. In the infinite width limit, the NTK can sometimes be computed analytically and is useful for understanding training and generalization of NN architectures. At finite widths, the NTK is also used to better initialize NNs, compare the conditioning across models, perform architecture search, and do meta-learning. Unfortunately, the finite width NTK is notoriously expensive to compute, which severely limits its practical utility. We perform the first in-depth analysis of the compute and memory requirements for NTK computation in finite width networks. Leveraging the structure of neural networks, we further propose two novel algorithms that change the exponent of the compute and memory requirements of the finite width NTK, dramatically improving efficiency. Our algorithms can be applied in a black box fashion to any differentiable function, including those implementing neural networks. 
We open-source our implementations within the Neural Tangents package at https://github.com/google/neural-tangents.}\n}", "pdf": "https://proceedings.mlr.press/v162/novak22a/novak22a.pdf", "supp": "", "pdf_size": 3431567, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2891750348147928089&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Google Brain, Mountain View, California, United States; Google Brain, Mountain View, California, United States; Google Brain, Mountain View, California, United States", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "github.com/google/neural-tangents", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/novak22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Lossless Neural Compression with Integer-Only Discrete Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17733", "id": "17733", "proceeding": "https://proceedings.mlr.press/v162/wang22a.html", "poster": "/media/PosterPDFs/ICML%202022/17257e81a344982579af1ae6415a7b8c.png?t=1656762533.5792477", "slides": "/media/icml-2022/Slides/17733_YDFcjCJ.pdf", "author_site": "Siyu Wang, Jianfei Chen, Chongxuan Li, Jun Zhu, Bo Zhang", "author": "Siyu Wang; Jianfei Chen; Chongxuan Li; Jun Zhu; Bo Zhang", "abstract": "By applying entropy codecs with learned data distributions, neural compressors have significantly outperformed traditional codecs in terms of compression ratio. However, the high inference latency of neural networks hinders the deployment of neural compressors in practical applications. In this work, we propose Integer-only Discrete Flows (IODF) an efficient neural compressor with integer-only arithmetic. Our work is built upon integer discrete flows, which consists of invertible transformations between discrete random variables. We propose efficient invertible transformations with integer-only arithmetic based on 8-bit quantization. Our invertible transformation is equipped with learnable binary gates to remove redundant filters during inference. We deploy IODF with TensorRT on GPUs, achieving $10\\times$ inference speedup compared to the fastest existing neural compressors, while retaining the high compression rates on ImageNet32 and ImageNet64.", "bibtex": "@InProceedings{pmlr-v162-wang22a,\n title = \t {Fast Lossless Neural Compression with Integer-Only Discrete Flows},\n author = {Wang, Siyu and Chen, Jianfei and Li, Chongxuan and Zhu, Jun and Zhang, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22562--22575},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22a/wang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22a.html},\n abstract = \t {By applying entropy codecs with learned data distributions, neural compressors have significantly outperformed traditional codecs in terms of compression ratio. 
However, the high inference latency of neural networks hinders the deployment of neural compressors in practical applications. In this work, we propose Integer-only Discrete Flows (IODF) an efficient neural compressor with integer-only arithmetic. Our work is built upon integer discrete flows, which consists of invertible transformations between discrete random variables. We propose efficient invertible transformations with integer-only arithmetic based on 8-bit quantization. Our invertible transformation is equipped with learnable binary gates to remove redundant filters during inference. We deploy IODF with TensorRT on GPUs, achieving $10\\times$ inference speedup compared to the fastest existing neural compressors, while retaining the high compression rates on ImageNet32 and ImageNet64.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22a/wang22a.pdf", "supp": "", "pdf_size": 1545038, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9606476142959964204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wang22a.html" }, { "title": "Fast Population-Based Reinforcement Learning on a Single Machine", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18253", "id": "18253", "proceeding": "https://proceedings.mlr.press/v162/flajolet22a.html", "poster": "/media/PosterPDFs/ICML%202022/51425b752a0b402ed3effc83fc4bbb74_m0PoSUQ.png?t=1657748854.2794664", "slides": "", "author_site": "Arthur Flajolet, Claire Bizon Monroc, Karim Beguir, Thomas Pierrot", "author": "Arthur Flajolet; Claire Bizon Monroc; Karim Beguir; Thomas Pierrot", "abstract": "Training populations of agents has demonstrated great promise in Reinforcement Learning for stabilizing training, improving exploration and asymptotic performance, and generating a diverse set of solutions. However, population-based training is often not considered by practitioners as it is perceived to be either prohibitively slow (when implemented sequentially), or computationally expensive (if agents are trained in parallel on independent accelerators). In this work, we compare implementations and revisit previous studies to show that the judicious use of compilation and vectorization allows population-based training to be performed on a single machine with one accelerator with minimal overhead compared to training a single agent. We also show that, when provided with a few accelerators, our protocols extend to large population sizes for applications such as hyperparameter tuning. 
We hope that this work and the public release of our code will encourage practitioners to use population-based learning techniques more frequently for their research and applications.", "bibtex": "@InProceedings{pmlr-v162-flajolet22a,\n title = \t {Fast Population-Based Reinforcement Learning on a Single Machine},\n author = {Flajolet, Arthur and Monroc, Claire Bizon and Beguir, Karim and Pierrot, Thomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6533--6547},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/flajolet22a/flajolet22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/flajolet22a.html},\n abstract = \t {Training populations of agents has demonstrated great promise in Reinforcement Learning for stabilizing training, improving exploration and asymptotic performance, and generating a diverse set of solutions. However, population-based training is often not considered by practitioners as it is perceived to be either prohibitively slow (when implemented sequentially), or computationally expensive (if agents are trained in parallel on independent accelerators). In this work, we compare implementations and revisit previous studies to show that the judicious use of compilation and vectorization allows population-based training to be performed on a single machine with one accelerator with minimal overhead compared to training a single agent. We also show that, when provided with a few accelerators, our protocols extend to large population sizes for applications such as hyperparameter tuning. 
We hope that this work and the public release of our code will encourage practitioners to use population-based learning techniques more frequently for their research and applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/flajolet22a/flajolet22a.pdf", "supp": "", "pdf_size": 586970, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1715165115891315467&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "InstaDeep Ltd.; InstaDeep Ltd.; InstaDeep Ltd.; InstaDeep Ltd.", "aff_domain": "instadeep.com; ; ;instadeep.com", "email": "instadeep.com; ; ;instadeep.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/flajolet22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "InstaDeep", "aff_unique_dep": "", "aff_unique_url": "https://www.instadeep.com", "aff_unique_abbr": "InstaDeep", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Fast Provably Robust Decision Trees and Boosting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16127", "id": "16127", "proceeding": "https://proceedings.mlr.press/v162/guo22h.html", "poster": "/media/PosterPDFs/ICML%202022/2b6921f2c64dee16ba21ebf17f3c2c92.png?t=1657218599.0145986", "slides": "", "author_site": "Jun-Qi Guo, Ming-Zhuo Teng, Wei Gao, Zhi-Hua Zhou", "author": "Jun-Qi Guo; Ming-Zhuo Teng; Wei Gao; Zhi-Hua Zhou", "abstract": "Learning with adversarial robustness has been a challenge in contemporary machine learning, and recent years have witnessed increasing attention on robust decision trees and ensembles, mostly working with high computational complexity or without guarantees of provable robustness. This work proposes the Fast Provably Robust Decision Tree (FPRDT) with the smallest computational complexity O(n log n), a tradeoff between global and local optimizations over the adversarial 0/1 loss. We further develop the Provably Robust AdaBoost (PRAdaBoost) according to our robust decision trees, and present convergence analysis for training adversarial 0/1 loss. We conduct extensive experiments to support our approaches; in particular, our approaches are superior to those unprovably robust methods, and achieve better or comparable performance to those provably robust methods yet with the smallest running time.", "bibtex": "@InProceedings{pmlr-v162-guo22h,\n title = \t {Fast Provably Robust Decision Trees and Boosting},\n author = {Guo, Jun-Qi and Teng, Ming-Zhuo and Gao, Wei and Zhou, Zhi-Hua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8127--8144},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22h/guo22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22h.html},\n abstract = \t {Learning with adversarial robustness has been a challenge in contemporary machine learning, and recent years have witnessed increasing attention on robust decision trees and ensembles, mostly working with high computational complexity or without guarantees of provable robustness. 
This work proposes the Fast Provably Robust Decision Tree (FPRDT) with the smallest computational complexity O(n log n), a tradeoff between global and local optimizations over the adversarial 0/1 loss. We further develop the Provably Robust AdaBoost (PRAdaBoost) according to our robust decision trees, and present convergence analysis for training adversarial 0/1 loss. We conduct extensive experiments to support our approaches; in particular, our approaches are superior to those unprovably robust methods, and achieve better or comparable performance to those provably robust methods yet with the smallest running time.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22h/guo22h.pdf", "supp": "", "pdf_size": 945843, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=484250009437224820&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China", "aff_domain": "nju.edu.cn; ; ; ", "email": "nju.edu.cn; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/guo22h.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "National Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Fast Relative Entropy Coding with A* coding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18213", "id": "18213", "proceeding": "https://proceedings.mlr.press/v162/flamich22a.html", "poster": "", "slides": "", "author_site": "Gergely Flamich, Stratis Markou, Jose Miguel Hernandez-Lobato", "author": "Gergely Flamich; Stratis Markou; Jose Miguel Hernandez-Lobato", "abstract": "Relative entropy coding (REC) algorithms encode a sample from a target distribution Q using a proposal distribution P, such that the expected codelength is O(KL[Q || P]). REC can be seamlessly integrated with existing learned compression models since, unlike entropy coding, it does not assume discrete Q or P, and does not require quantisation. However, general REC algorithms require an intractable $\\Omega$(exp(KL[Q || P])) runtime. We introduce AS* and AD* coding, two REC algorithms based on A* sampling. We prove that, for continuous distributions over the reals, if the density ratio is unimodal, AS* has O(D$\\infty$[Q || P]) expected runtime, where D$\\infty$[Q || P] is the Renyi $\\infty$-divergence. We provide experimental evidence that AD* also has O(D$\\infty$[Q || P]) expected runtime. We prove that AS* and AD* achieve an expected codelength of O(KL[Q || P]). Further, we introduce DAD*, an approximate algorithm based on AD* which retains its favourable runtime and has bias similar to that of alternative methods. Focusing on VAEs, we propose the IsoKL VAE (IKVAE), which can be used with DAD* to further improve compression efficiency. 
We evaluate A* coding with (IK)VAEs on MNIST, showing that it can losslessly compress images near the theoretically optimal limit.", "bibtex": "@InProceedings{pmlr-v162-flamich22a,\n title = \t {Fast Relative Entropy Coding with A* coding},\n author = {Flamich, Gergely and Markou, Stratis and Hernandez-Lobato, Jose Miguel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6548--6577},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/flamich22a/flamich22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/flamich22a.html},\n abstract = \t {Relative entropy coding (REC) algorithms encode a sample from a target distribution Q using a proposal distribution P, such that the expected codelength is O(KL[Q || P]). REC can be seamlessly integrated with existing learned compression models since, unlike entropy coding, it does not assume discrete Q or P, and does not require quantisation. However, general REC algorithms require an intractable $\\Omega$(exp(KL[Q || P])) runtime. We introduce AS* and AD* coding, two REC algorithms based on A* sampling. We prove that, for continuous distributions over the reals, if the density ratio is unimodal, AS* has O(D$\\infty$[Q || P]) expected runtime, where D$\\infty$[Q || P] is the Renyi $\\infty$-divergence. We provide experimental evidence that AD* also has O(D$\\infty$[Q || P]) expected runtime. We prove that AS* and AD* achieve an expected codelength of O(KL[Q || P]). Further, we introduce DAD*, an approximate algorithm based on AD* which retains its favourable runtime and has bias similar to that of alternative methods. Focusing on VAEs, we propose the IsoKL VAE (IKVAE), which can be used with DAD* to further improve compression efficiency. 
We evaluate A* coding with (IK)VAEs on MNIST, showing that it can losslessly compress images near the theoretically optimal limit.}\n}", "pdf": "https://proceedings.mlr.press/v162/flamich22a/flamich22a.pdf", "supp": "", "pdf_size": 1123453, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15067093838009838231&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Engineering, University of Cambridge, Cambridge, UK+Microsoft Research, Cambridge, UK+Alan Turing Institute, London, UK; Department of Engineering, University of Cambridge, Cambridge, UK+Microsoft Research, Cambridge, UK+Alan Turing Institute, London, UK; Department of Engineering, University of Cambridge, Cambridge, UK+Microsoft Research, Cambridge, UK+Alan Turing Institute, London, UK", "aff_domain": "cam.ac.uk;cam.ac.uk; ", "email": "cam.ac.uk;cam.ac.uk; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/flamich22a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", "aff_unique_norm": "University of Cambridge;Microsoft;Alan Turing Institute", "aff_unique_dep": "Department of Engineering;Microsoft Research;", "aff_unique_url": "https://www.cam.ac.uk;https://www.microsoft.com/en-us/research;https://www.turing.ac.uk", "aff_unique_abbr": "Cambridge;MSR;ATI", "aff_campus_unique_index": "0+0+1;0+0+1;0+0+1", "aff_campus_unique": "Cambridge;London", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", "aff_country_unique": "United Kingdom" }, { "title": "Fast and Provable Nonconvex Tensor RPCA", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17853", "id": "17853", "proceeding": "https://proceedings.mlr.press/v162/qiu22d.html", "poster": "/media/PosterPDFs/ICML%202022/46b2644cbdf489fac0e2d192212d206d.png?t=1657285339.718267", "slides": "", "author_site": "Haiquan Qiu, Yao Wang, Shaojie Tang, Deyu Meng, QUANMING YAO", "author": "Haiquan Qiu; Yao Wang; Shaojie Tang; Deyu Meng; Quanming Yao", "abstract": "In this paper, we study nonconvex tensor robust principal component analysis (RPCA) based on the $t$-SVD. We first propose an alternating projection method, i.e., APT, which converges linearly to the ground-truth under the incoherence conditions of tensors. However, as the projection to the low-rank tensor space in APT can be slow, we further propose to speedup such a process by utilizing the property of the tangent space of low-rank. The resulting algorithm, i.e., EAPT, is not only more efficient than APT but also keeps the linear convergence. Compared with existing tensor RPCA works, the proposed method, especially EAPT, is not only more effective due to the recovery guarantee and adaption in the transformed (frequency) domain but also more efficient due to faster convergence rate and lower iteration complexity. 
These benefits are also empirically verified both on synthetic data, and real applications, e.g., hyperspectral image denoising and video background subtraction.", "bibtex": "@InProceedings{pmlr-v162-qiu22d,\n title = \t {Fast and Provable Nonconvex Tensor {RPCA}},\n author = {Qiu, Haiquan and Wang, Yao and Tang, Shaojie and Meng, Deyu and Yao, Quanming},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18211--18249},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiu22d/qiu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiu22d.html},\n abstract = \t {In this paper, we study nonconvex tensor robust principal component analysis (RPCA) based on the $t$-SVD. We first propose an alternating projection method, i.e., APT, which converges linearly to the ground-truth under the incoherence conditions of tensors. However, as the projection to the low-rank tensor space in APT can be slow, we further propose to speedup such a process by utilizing the property of the tangent space of low-rank. The resulting algorithm, i.e., EAPT, is not only more efficient than APT but also keeps the linear convergence. Compared with existing tensor RPCA works, the proposed method, especially EAPT, is not only more effective due to the recovery guarantee and adaption in the transformed (frequency) domain but also more efficient due to faster convergence rate and lower iteration complexity. These benefits are also empirically verified both on synthetic data, and real applications, e.g., hyperspectral image denoising and video background subtraction.}\n}", "pdf": "https://proceedings.mlr.press/v162/qiu22d/qiu22d.pdf", "supp": "", "pdf_size": 1112022, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1610374978050737734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Xi\u2019an Jiaotong University, Xi\u2019an, China+Tsinghua University, Beijing, China+Paradigm Inc., Beijing, China; Xi\u2019an Jiaotong University, Xi\u2019an, China; The University of Texas at Dallas, Texas, USA; Xi\u2019an Jiaotong University, Xi\u2019an, China+Macau University of Science and Technology, Macau, China; Tsinghua University, Beijing, China", "aff_domain": "gmail.com; ; ; ;connect.ust.hk", "email": "gmail.com; ; ; ;connect.ust.hk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qiu22d.html", "aff_unique_index": "0+1+2;0;3;0+4;1", "aff_unique_norm": "Xi'an Jiao Tong University;Tsinghua University;Paradigm Inc.;University of Texas at Dallas;Macau University of Science and Technology", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn;;https://www.utdallas.edu;https://www.must.edu.mo", "aff_unique_abbr": "XJTU;THU;;UT Dallas;MUST", "aff_campus_unique_index": "0+1;0;3;0+4;1", "aff_campus_unique": "Xi'an;Beijing;;Dallas;Macau", "aff_country_unique_index": "0+0+0;0;1;0+0;0", "aff_country_unique": "China;United States" }, { "title": "Fast and Reliable Evaluation of Adversarial Robustness with Minimum-Margin Attack", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16827", "id": "16827", "proceeding": "https://proceedings.mlr.press/v162/gao22i.html", 
"poster": "/media/PosterPDFs/ICML%202022/030e65da2b1c944090548d36b244b28d.png?t=1657522074.4294252", "slides": "", "author_site": "Ruize Gao, Jiongxiao Wang, Kaiwen Zhou, Feng Liu, Binghui Xie, Gang Niu, Bo Han, James Cheng", "author": "Ruize Gao; Jiongxiao Wang; Kaiwen Zhou; Feng Liu; Binghui Xie; Gang Niu; Bo Han; James Cheng", "abstract": "The AutoAttack (AA) has been the most reliable method to evaluate adversarial robustness when considerable computational resources are available. However, the high computational cost (e.g., 100 times more than that of the project gradient descent attack) makes AA infeasible for practitioners with limited computational resources, and also hinders applications of AA in the adversarial training (AT). In this paper, we propose a novel method, minimum-margin (MM) attack, to fast and reliably evaluate adversarial robustness. Compared with AA, our method achieves comparable performance but only costs 3% of the computational time in extensive experiments. The reliability of our method lies in that we evaluate the quality of adversarial examples using the margin between two targets that can precisely identify the most adversarial example. The computational efficiency of our method lies in an effective Sequential TArget Ranking Selection (STARS) method, ensuring that the cost of the MM attack is independent of the number of classes. The MM attack opens a new way for evaluating adversarial robustness and provides a feasible and reliable way to generate high-quality adversarial examples in AT.", "bibtex": "@InProceedings{pmlr-v162-gao22i,\n title = \t {Fast and Reliable Evaluation of Adversarial Robustness with Minimum-Margin Attack},\n author = {Gao, Ruize and Wang, Jiongxiao and Zhou, Kaiwen and Liu, Feng and Xie, Binghui and Niu, Gang and Han, Bo and Cheng, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7144--7163},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22i/gao22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22i.html},\n abstract = \t {The AutoAttack (AA) has been the most reliable method to evaluate adversarial robustness when considerable computational resources are available. However, the high computational cost (e.g., 100 times more than that of the project gradient descent attack) makes AA infeasible for practitioners with limited computational resources, and also hinders applications of AA in the adversarial training (AT). In this paper, we propose a novel method, minimum-margin (MM) attack, to fast and reliably evaluate adversarial robustness. Compared with AA, our method achieves comparable performance but only costs 3% of the computational time in extensive experiments. The reliability of our method lies in that we evaluate the quality of adversarial examples using the margin between two targets that can precisely identify the most adversarial example. The computational efficiency of our method lies in an effective Sequential TArget Ranking Selection (STARS) method, ensuring that the cost of the MM attack is independent of the number of classes. 
The MM attack opens a new way for evaluating adversarial robustness and provides a feasible and reliable way to generate high-quality adversarial examples in AT.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22i/gao22i.pdf", "supp": "", "pdf_size": 1796887, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16577119936016409064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong; Department of Computer Science and Engineering, The Chinese University of Hong Kong; Department of Computer Science and Engineering, The Chinese University of Hong Kong; School of Mathematics and Statistics, The University of Melbourne; Department of Computer Science and Engineering, The Chinese University of Hong Kong; RIKEN-AIP; Department of Computer Science, Hong Kong Baptist University; Department of Computer Science and Engineering, The Chinese University of Hong Kong", "aff_domain": "cse.cuhk.edu.hk;comp.hkbu.edu.hk; ; ; ; ; ;", "email": "cse.cuhk.edu.hk;comp.hkbu.edu.hk; ; ; ; ; ;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/gao22i.html", "aff_unique_index": "0;0;0;1;0;2;3;0", "aff_unique_norm": "Chinese University of Hong Kong;University of Melbourne;RIKEN Center for Advanced Intelligence Project;Hong Kong Baptist University", "aff_unique_dep": "Department of Computer Science and Engineering;School of Mathematics and Statistics;Center for Advanced Intelligence Project;Department of Computer Science", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.unimelb.edu.au;https://aip.Riken.jp;https://www.hkbu.edu.hk", "aff_unique_abbr": "CUHK;UniMelb;RIKEN-AIP;HKBU", "aff_campus_unique_index": "0;0;0;1;0;0;0", "aff_campus_unique": "Hong Kong SAR;Melbourne;", "aff_country_unique_index": "0;0;0;1;0;2;0;0", "aff_country_unique": "China;Australia;Japan" }, { "title": "Fast rates for noisy interpolation require rethinking the effect of inductive bias", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17649", "id": "17649", "proceeding": "https://proceedings.mlr.press/v162/donhauser22a.html", "poster": "/media/PosterPDFs/ICML%202022/6e96be832cf8bc6b35a956e8fb66c76a.png?t=1658200145.2856305", "slides": "", "author_site": "Konstantin Donhauser, Nicol\u00f2 Ruggeri, Stefan Stojanovic, Fanny Yang", "author": "Konstantin Donhauser; Nicol\u00f2 Ruggeri; Stefan Stojanovic; Fanny Yang", "abstract": "Good generalization performance on high-dimensional data crucially hinges on a simple structure of the ground truth and a corresponding strong inductive bias of the estimator. Even though this intuition is valid for regularized models, in this paper we caution against a strong inductive bias for interpolation in the presence of noise: While a stronger inductive bias encourages a simpler structure that is more aligned with the ground truth, it also increases the detrimental effect of noise. Specifically, for both linear regression and classification with a sparse ground truth, we prove that minimum $\\ell_p$-norm and maximum $\\ell_p$-margin interpolators achieve fast polynomial rates close to order $1/n$ for $p > 1$ compared to a logarithmic rate for $p = 1$. 
Finally, we provide preliminary experimental evidence that this trade-off may also play a crucial role in understanding non-linear interpolating models used in practice.", "bibtex": "@InProceedings{pmlr-v162-donhauser22a,\n title = \t {Fast rates for noisy interpolation require rethinking the effect of inductive bias},\n author = {Donhauser, Konstantin and Ruggeri, Nicol{\\`o} and Stojanovic, Stefan and Yang, Fanny},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5397--5428},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/donhauser22a/donhauser22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/donhauser22a.html},\n abstract = \t {Good generalization performance on high-dimensional data crucially hinges on a simple structure of the ground truth and a corresponding strong inductive bias of the estimator. Even though this intuition is valid for regularized models, in this paper we caution against a strong inductive bias for interpolation in the presence of noise: While a stronger inductive bias encourages a simpler structure that is more aligned with the ground truth, it also increases the detrimental effect of noise. Specifically, for both linear regression and classification with a sparse ground truth, we prove that minimum $\\ell_p$-norm and maximum $\\ell_p$-margin interpolators achieve fast polynomial rates close to order $1/n$ for $p > 1$ compared to a logarithmic rate for $p = 1$. Finally, we provide preliminary experimental evidence that this trade-off may also play a crucial role in understanding non-linear interpolating models used in practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/donhauser22a/donhauser22a.pdf", "supp": "", "pdf_size": 674635, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10985894687814782548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "ETH Zurich, Department of Computer Science+ETH AI Center; ETH Zurich, Department of Computer Science+Max-Planck-Institute for Intelligent Systems, T\u00fcbingen, Germany; ETH Zurich; ETH Zurich, Department of Computer Science", "aff_domain": "ai.ethz.ch; ; ; ", "email": "ai.ethz.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/donhauser22a.html", "aff_unique_index": "0+0;0+1;0;0", "aff_unique_norm": "ETH Zurich;Max-Planck-Institute for Intelligent Systems", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-is.mpg.de", "aff_unique_abbr": "ETHZ;MPI-IS", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Zurich;T\u00fcbingen", "aff_country_unique_index": "0+0;0+1;0;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Fast-Rate PAC-Bayesian Generalization Bounds for Meta-Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16025", "id": "16025", "proceeding": "https://proceedings.mlr.press/v162/guan22b.html", "poster": "/media/PosterPDFs/ICML%202022/3cef96dcc9b8035d23f69e30bb19218a.png?t=1656571814.78716", "slides": "", "author_site": "Jiechao Guan, Zhiwu Lu", "author": "Jiechao Guan; Zhiwu Lu", "abstract": "PAC-Bayesian error bounds provide a theoretical guarantee on the 
generalization abilities of meta-learning from training tasks to unseen tasks. However, it is still unclear how tight PAC-Bayesian bounds we can achieve for meta-learning. In this work, we propose a general PAC-Bayesian framework to cope with single-task learning and meta-learning uniformly. With this framework, we generalize the two tightest PAC-Bayesian bounds (i.e., kl-bound and Catoni-bound) from single-task learning to standard meta-learning, resulting in fast convergence rates for PAC-Bayesian meta-learners. By minimizing the derived two bounds, we develop two meta-learning algorithms for classification problems with deep neural networks. For regression problems, by setting Gibbs optimal posterior for each training task, we obtain the closed-form formula of the minimizer of our Catoni-bound, leading to an efficient Gibbs meta-learning algorithm. Although minimizing our kl-bound can not yield a closed-form solution, we show that it can be extended for analyzing the more challenging meta-learning setting where samples from different training tasks exhibit interdependencies. Experiments empirically show that our proposed meta-learning algorithms achieve competitive results with respect to latest works.", "bibtex": "@InProceedings{pmlr-v162-guan22b,\n title = \t {Fast-Rate {PAC}-{B}ayesian Generalization Bounds for Meta-Learning},\n author = {Guan, Jiechao and Lu, Zhiwu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7930--7948},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guan22b/guan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/guan22b.html},\n abstract = \t {PAC-Bayesian error bounds provide a theoretical guarantee on the generalization abilities of meta-learning from training tasks to unseen tasks. However, it is still unclear how tight PAC-Bayesian bounds we can achieve for meta-learning. In this work, we propose a general PAC-Bayesian framework to cope with single-task learning and meta-learning uniformly. With this framework, we generalize the two tightest PAC-Bayesian bounds (i.e., kl-bound and Catoni-bound) from single-task learning to standard meta-learning, resulting in fast convergence rates for PAC-Bayesian meta-learners. By minimizing the derived two bounds, we develop two meta-learning algorithms for classification problems with deep neural networks. For regression problems, by setting Gibbs optimal posterior for each training task, we obtain the closed-form formula of the minimizer of our Catoni-bound, leading to an efficient Gibbs meta-learning algorithm. Although minimizing our kl-bound can not yield a closed-form solution, we show that it can be extended for analyzing the more challenging meta-learning setting where samples from different training tasks exhibit interdependencies. 
Experiments empirically show that our proposed meta-learning algorithms achieve competitive results with respect to latest works.}\n}", "pdf": "https://proceedings.mlr.press/v162/guan22b/guan22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/guan22b-supp.zip", "pdf_size": 7060351, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14212467314474916298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "School of Information, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China; Gaoling School of Arti\ufb01cial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "email": "ruc.edu.cn;ruc.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/guan22b.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Renmin University of China;Beijing Key Laboratory of Big Data Management and Analysis Methods", "aff_unique_dep": "School of Information;Big Data Management and Analysis", "aff_unique_url": "http://www.ruc.edu.cn;", "aff_unique_abbr": "RUC;", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "Faster Algorithms for Learning Convex Functions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16555", "id": "16555", "proceeding": "https://proceedings.mlr.press/v162/siahkamari22a.html", "poster": "/media/PosterPDFs/ICML%202022/d7a728a67d909e714c0774e22cb806f2.png?t=1658103121.3125315", "slides": "", "author_site": "Ali Siahkamari, Durmus Alp Emre Acar, Christopher Liao, Kelly Geyer, Venkatesh Saligrama, Brian Kulis", "author": "Ali Siahkamari; Durmus Alp Emre Acar; Christopher Liao; Kelly L Geyer; Venkatesh Saligrama; Brian Kulis", "abstract": "The task of approximating an arbitrary convex function arises in several learning problems such as convex regression, learning with a difference of convex (DC) functions, and learning Bregman or $f$-divergences. In this paper, we develop and analyze an approach for solving a broad range of convex function learning problems that is faster than state-of-the-art approaches. Our approach is based on a 2-block ADMM method where each block can be computed in closed form. For the task of convex Lipschitz regression, we establish that our proposed algorithm converges with iteration complexity of $ O(n\\sqrt{d}/\\epsilon)$ for a dataset $\\bm X \\in \\mathbb R^{n\\times d}$ and $\\epsilon > 0$. Combined with per-iteration computation complexity, our method converges with the rate $O(n^3 d^{1.5}/\\epsilon+n^2 d^{2.5}/\\epsilon+n d^3/\\epsilon)$. This new rate improves the state of the art rate of $O(n^5d^2/\\epsilon)$ if $d = o( n^4)$. Further we provide similar solvers for DC regression and Bregman divergence learning. Unlike previous approaches, our method is amenable to the use of GPUs. 
We demonstrate on regression and metric learning experiments that our approach is over 100 times faster than existing approaches on some data sets, and produces results that are comparable to state of the art.", "bibtex": "@InProceedings{pmlr-v162-siahkamari22a,\n title = \t {Faster Algorithms for Learning Convex Functions},\n author = {Siahkamari, Ali and Acar, Durmus Alp Emre and Liao, Christopher and Geyer, Kelly L and Saligrama, Venkatesh and Kulis, Brian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20176--20194},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/siahkamari22a/siahkamari22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/siahkamari22a.html},\n abstract = \t {The task of approximating an arbitrary convex function arises in several learning problems such as convex regression, learning with a difference of convex (DC) functions, and learning Bregman or $f$-divergences. In this paper, we develop and analyze an approach for solving a broad range of convex function learning problems that is faster than state-of-the-art approaches. Our approach is based on a 2-block ADMM method where each block can be computed in closed form. For the task of convex Lipschitz regression, we establish that our proposed algorithm converges with iteration complexity of $ O(n\\sqrt{d}/\\epsilon)$ for a dataset $\\bm X \\in \\mathbb R^{n\\times d}$ and $\\epsilon > 0$. Combined with per-iteration computation complexity, our method converges with the rate $O(n^3 d^{1.5}/\\epsilon+n^2 d^{2.5}/\\epsilon+n d^3/\\epsilon)$. This new rate improves the state of the art rate of $O(n^5d^2/\\epsilon)$ if $d = o( n^4)$. Further we provide similar solvers for DC regression and Bregman divergence learning. Unlike previous approaches, our method is amenable to the use of GPUs. 
We demonstrate on regression and metric learning experiments that our approach is over 100 times faster than existing approaches on some data sets, and produces results that are comparable to state of the art.}\n}", "pdf": "https://proceedings.mlr.press/v162/siahkamari22a/siahkamari22a.pdf", "supp": "", "pdf_size": 429493, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13666890982645264315&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Boston University; Boston University; Boston University; Boston University; Boston University; Boston University", "aff_domain": "bu.edu; ; ; ; ; ", "email": "bu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/siahkamari22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Faster Fundamental Graph Algorithms via Learned Predictions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16397", "id": "16397", "proceeding": "https://proceedings.mlr.press/v162/chen22v.html", "poster": "/media/PosterPDFs/ICML%202022/4fac9ba115140ac4f1c22da82aa0bc7f.png?t=1658103825.581159", "slides": "/media/icml-2022/Slides/16397.pdf", "author_site": "Justin Chen, Sandeep Silwal, Ali Vakilian, Fred Zhang", "author": "Justin Chen; Sandeep Silwal; Ali Vakilian; Fred Zhang", "abstract": "We consider the question of speeding up classic graph algorithms with machine-learned predictions. In this model, algorithms are furnished with extra advice learned from past or similar instances. Given the additional information, we aim to improve upon the traditional worst-case run-time guarantees. Our contributions are the following: (i) We give a faster algorithm for minimum-weight bipartite matching via learned duals, improving the recent result by Dinitz, Im, Lavastida, Moseley and Vassilvitskii (NeurIPS, 2021); (ii) We extend the learned dual approach to the single-source shortest path problem (with negative edge lengths), achieving an almost linear runtime given sufficiently accurate predictions which improves upon the classic fastest algorithm due to Goldberg (SIAM J. Comput., 1995); (iii) We provide a general reduction-based framework for learning-based graph algorithms, leading to new algorithms for degree-constrained subgraph and minimum-cost 0-1 flow, based on reductions to bipartite matching and the shortest path problem. 
Finally, we give a set of general learnability theorems, showing that the predictions required by our algorithms can be efficiently learned in a PAC fashion.", "bibtex": "@InProceedings{pmlr-v162-chen22v,\n title = \t {Faster Fundamental Graph Algorithms via Learned Predictions},\n author = {Chen, Justin and Silwal, Sandeep and Vakilian, Ali and Zhang, Fred},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3583--3602},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22v/chen22v.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22v.html},\n abstract = \t {We consider the question of speeding up classic graph algorithms with machine-learned predictions. In this model, algorithms are furnished with extra advice learned from past or similar instances. Given the additional information, we aim to improve upon the traditional worst-case run-time guarantees. Our contributions are the following: (i) We give a faster algorithm for minimum-weight bipartite matching via learned duals, improving the recent result by Dinitz, Im, Lavastida, Moseley and Vassilvitskii (NeurIPS, 2021); (ii) We extend the learned dual approach to the single-source shortest path problem (with negative edge lengths), achieving an almost linear runtime given sufficiently accurate predictions which improves upon the classic fastest algorithm due to Goldberg (SIAM J. Comput., 1995); (iii) We provide a general reduction-based framework for learning-based graph algorithms, leading to new algorithms for degree-constrained subgraph and minimum-cost 0-1 flow, based on reductions to bipartite matching and the shortest path problem. 
Finally, we give a set of general learnability theorems, showing that the predictions required by our algorithms can be efficiently learned in a PAC fashion.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22v/chen22v.pdf", "supp": "", "pdf_size": 421624, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18345732880761146156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "MIT; MIT; TTIC; UC Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chen22v.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Technological Institute at Chicago;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.ttic.edu;https://www.berkeley.edu", "aff_unique_abbr": "MIT;TTIC;UC Berkeley", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Faster Privacy Accounting via Evolving Discretization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17465", "id": "17465", "proceeding": "https://proceedings.mlr.press/v162/ghazi22a.html", "poster": "/media/PosterPDFs/ICML%202022/881cb5534ac04cd691cdfa681afffb45.png?t=1658033037.720458", "slides": "", "author_site": "Badih Ghazi, Pritish Kamath, Ravi Kumar, Pasin Manurangsi", "author": "Badih Ghazi; Pritish Kamath; Ravi Kumar; Pasin Manurangsi", "abstract": "We introduce a new algorithm for numerical composition of privacy random variables, useful for computing the accurate differential privacy parameters for compositions of mechanisms. Our algorithm achieves a running time and memory usage of $polylog(k)$ for the task of self-composing a mechanism, from a broad class of mechanisms, $k$ times; this class, e.g., includes the sub-sampled Gaussian mechanism, that appears in the analysis of differentially private stochastic gradient descent (DP-SGD). By comparison, recent work by Gopi et al. (NeurIPS 2021) has obtained a running time of $\\widetilde{O}(\\sqrt{k})$ for the same task. Our approach extends to the case of composing $k$ different mechanisms in the same class, improving upon the running time and memory usage in their work from $\\widetilde{O}(k^{1.5})$ to $\\wtilde{O}(k)$.", "bibtex": "@InProceedings{pmlr-v162-ghazi22a,\n title = \t {Faster Privacy Accounting via Evolving Discretization},\n author = {Ghazi, Badih and Kamath, Pritish and Kumar, Ravi and Manurangsi, Pasin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7470--7483},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghazi22a/ghazi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghazi22a.html},\n abstract = \t {We introduce a new algorithm for numerical composition of privacy random variables, useful for computing the accurate differential privacy parameters for compositions of mechanisms. 
Our algorithm achieves a running time and memory usage of $polylog(k)$ for the task of self-composing a mechanism, from a broad class of mechanisms, $k$ times; this class, e.g., includes the sub-sampled Gaussian mechanism, that appears in the analysis of differentially private stochastic gradient descent (DP-SGD). By comparison, recent work by Gopi et al. (NeurIPS 2021) has obtained a running time of $\\widetilde{O}(\\sqrt{k})$ for the same task. Our approach extends to the case of composing $k$ different mechanisms in the same class, improving upon the running time and memory usage in their work from $\\widetilde{O}(k^{1.5})$ to $\\wtilde{O}(k)$.}\n}", "pdf": "https://proceedings.mlr.press/v162/ghazi22a/ghazi22a.pdf", "supp": "", "pdf_size": 1728582, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17293762714214781897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google Research, USA; Google Research, USA; Google Research, USA; Google Research, USA", "aff_domain": "alum.mit.edu;google.com; ; ", "email": "alum.mit.edu;google.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ghazi22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fat\u2013Tailed Variational Inference with Anisotropic Tail Adaptive Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16405", "id": "16405", "proceeding": "https://proceedings.mlr.press/v162/liang22a.html", "poster": "/media/PosterPDFs/ICML%202022/6a571fe98a2ba453e84923b447d79cff_Rdq34QO.png?t=1656085248.3338747", "slides": "/media/icml-2022/Slides/16405_J1RZmnx.pdf", "author_site": "Feynman Liang, Michael Mahoney, Liam Hodgkinson", "author": "Feynman Liang; Michael Mahoney; Liam Hodgkinson", "abstract": "While fat-tailed densities commonly arise as posterior and marginal distributions in robust models and scale mixtures, they present a problematic scenario when Gaussian-based variational inference fails to accurately capture tail decay. We first improve previous theory on tails of Lipschitz flows by quantifying how they affect the rate of tail decay and expanding the theory to non-Lipschitz polynomial flows. Next, we develop an alternative theory for multivariate tail parameters which is sensitive to tail-anisotropy. In doing so, we unveil a fundamental problem which plagues many existing flow-based methods: they can only model tail-isotropic distributions (i.e., distributions having the same tail parameter in every direction). To mitigate this and enable modeling of tail-anisotropic targets, we propose anisotropic tail-adaptive flows (ATAF). 
Experimental results confirm ATAF on both synthetic and real-world targets is competitive with prior work while also exhibiting appropriate tail-anisotropy.", "bibtex": "@InProceedings{pmlr-v162-liang22a,\n title = \t {{F}at{\u2013}{T}ailed Variational Inference with Anisotropic Tail Adaptive Flows},\n author = {Liang, Feynman and Mahoney, Michael and Hodgkinson, Liam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13257--13270},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liang22a/liang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/liang22a.html},\n abstract = \t {While fat-tailed densities commonly arise as posterior and marginal distributions in robust models and scale mixtures, they present a problematic scenario when Gaussian-based variational inference fails to accurately capture tail decay. We first improve previous theory on tails of Lipschitz flows by quantifying how they affect the rate of tail decay and expanding the theory to non-Lipschitz polynomial flows. Next, we develop an alternative theory for multivariate tail parameters which is sensitive to tail-anisotropy. In doing so, we unveil a fundamental problem which plagues many existing flow-based methods: they can only model tail-isotropic distributions (i.e., distributions having the same tail parameter in every direction). To mitigate this and enable modeling of tail-anisotropic targets, we propose anisotropic tail-adaptive flows (ATAF). Experimental results confirm ATAF on both synthetic and real-world targets is competitive with prior work while also exhibiting appropriate tail-anisotropy.}\n}", "pdf": "https://proceedings.mlr.press/v162/liang22a/liang22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liang22a-supp.zip", "pdf_size": 517136, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2834665906033401797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Statistics, University of California, Berkeley, CA + Meta, Menlo Park, CA; Department of Statistics, University of California, Berkeley, CA; Department of Statistics, University of California, Berkeley, CA + International Computer Science Institute, Berkeley, CA", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/liang22a.html", "aff_unique_index": "0+1;0;0+2", "aff_unique_norm": "University of California, Berkeley;Meta;International Computer Science Institute", "aff_unique_dep": "Department of Statistics;Meta Platforms, Inc.;", "aff_unique_url": "https://www.berkeley.edu;https://www.meta.com;https://www.icsi.berkeley.edu/", "aff_unique_abbr": "UC Berkeley;Meta;ICSI", "aff_campus_unique_index": "0+1;0;0+0", "aff_campus_unique": "Berkeley;Menlo Park", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United States" }, { "title": "Feature Learning and Signal Propagation in Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18009", "id": "18009", "proceeding": "https://proceedings.mlr.press/v162/lou22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/8725fb777f25776ffa9076e44fcfd776.png?t=1656822826.5061216", "slides": "", "author_site": "Yizhang Lou, Chris Mingard, Soufiane Hayou", "author": "Yizhang Lou; Chris E Mingard; Soufiane Hayou", "abstract": "Recent work by Baratin et al. (2021) sheds light on an intriguing pattern that occurs during the training of deep neural networks: some layers align much more with data compared to other layers (where the alignment is defined as the normalize euclidean product of the tangent features matrix and the data labels matrix). The curve of the alignment as a function of layer index (generally) exhibits a ascent-descent pattern where the maximum is reached for some hidden layer. In this work, we provide the first explanation for this phenomenon. We introduce the Equilibrium Hypothesis which connects this alignment pattern to signal propagation in deep neural networks. Our experiments demonstrate an excellent match with the theoretical predictions.", "bibtex": "@InProceedings{pmlr-v162-lou22a,\n title = \t {Feature Learning and Signal Propagation in Deep Neural Networks},\n author = {Lou, Yizhang and Mingard, Chris E and Hayou, Soufiane},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14248--14282},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lou22a/lou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lou22a.html},\n abstract = \t {Recent work by Baratin et al. (2021) sheds light on an intriguing pattern that occurs during the training of deep neural networks: some layers align much more with data compared to other layers (where the alignment is defined as the normalize euclidean product of the tangent features matrix and the data labels matrix). The curve of the alignment as a function of layer index (generally) exhibits a ascent-descent pattern where the maximum is reached for some hidden layer. In this work, we provide the first explanation for this phenomenon. We introduce the Equilibrium Hypothesis which connects this alignment pattern to signal propagation in deep neural networks. 
Our experiments demonstrate an excellent match with the theoretical predictions.}\n}", "pdf": "https://proceedings.mlr.press/v162/lou22a/lou22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lou22a-supp.zip", "pdf_size": 6738874, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1302052453476758976&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "St John\u2019s College, University of Oxford, Oxford, UK; PTCL, University of Oxford, Oxford, UK+Department of Physics, University of Oxford, UK; Department of Mathematics, National University of Singapore", "aff_domain": "sjc.ox.ac.uk; ;nus.edu.sg", "email": "sjc.ox.ac.uk; ;nus.edu.sg", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lou22a.html", "aff_unique_index": "0;0+0;1", "aff_unique_norm": "University of Oxford;National University of Singapore", "aff_unique_dep": "St John\u2019s College;Department of Mathematics", "aff_unique_url": "https://www.ox.ac.uk;https://www.nus.edu.sg", "aff_unique_abbr": "Oxford;NUS", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0+0;1", "aff_country_unique": "United Kingdom;Singapore" }, { "title": "Feature Space Particle Inference for Neural Network Ensembles", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16979", "id": "16979", "proceeding": "https://proceedings.mlr.press/v162/yashima22a.html", "poster": "/media/PosterPDFs/ICML%202022/7f7c351ee977c765aa8cd5c7020bc38f_dIm3adZ.png?t=1657636635.611995", "slides": "", "author_site": "Shingo Yashima, Teppei Suzuki, Kohta Ishikawa, Ikuro Sato, Rei Kawakami", "author": "Shingo Yashima; Teppei Suzuki; Kohta Ishikawa; Ikuro Sato; Rei Kawakami", "abstract": "Ensembles of deep neural networks demonstrate improved performance over single models. For enhancing the diversity of ensemble members while keeping their performance, particle-based inference methods offer a promising approach from a Bayesian perspective. However, the best way to apply these methods to neural networks is still unclear: seeking samples from the weight-space posterior suffers from inefficiency due to the over-parameterization issues, while seeking samples directly from the function-space posterior often leads to serious underfitting. In this study, we propose to optimize particles in the feature space where activations of a specific intermediate layer lie to alleviate the abovementioned difficulties. Our method encourages each member to capture distinct features, which are expected to increase the robustness of the ensemble prediction. 
Extensive evaluation on real-world datasets exhibits that our model significantly outperforms the gold-standard Deep Ensembles on various metrics, including accuracy, calibration, and robustness.", "bibtex": "@InProceedings{pmlr-v162-yashima22a,\n title = \t {Feature Space Particle Inference for Neural Network Ensembles},\n author = {Yashima, Shingo and Suzuki, Teppei and Ishikawa, Kohta and Sato, Ikuro and Kawakami, Rei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25452--25468},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yashima22a/yashima22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yashima22a.html},\n abstract = \t {Ensembles of deep neural networks demonstrate improved performance over single models. For enhancing the diversity of ensemble members while keeping their performance, particle-based inference methods offer a promising approach from a Bayesian perspective. However, the best way to apply these methods to neural networks is still unclear: seeking samples from the weight-space posterior suffers from inefficiency due to the over-parameterization issues, while seeking samples directly from the function-space posterior often leads to serious underfitting. In this study, we propose to optimize particles in the feature space where activations of a specific intermediate layer lie to alleviate the abovementioned difficulties. Our method encourages each member to capture distinct features, which are expected to increase the robustness of the ensemble prediction. 
Extensive evaluation on real-world datasets exhibits that our model significantly outperforms the gold-standard Deep Ensembles on various metrics, including accuracy, calibration, and robustness.}\n}", "pdf": "https://proceedings.mlr.press/v162/yashima22a/yashima22a.pdf", "supp": "", "pdf_size": 464538, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11870961066098934714&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Denso IT Laboratory Inc., Tokyo, Japan+Tokyo Institute of Technology, Tokyo, Japan; Denso IT Laboratory Inc., Tokyo, Japan+Tokyo Institute of Technology, Tokyo, Japan; Denso IT Laboratory Inc., Tokyo, Japan+Tokyo Institute of Technology, Tokyo, Japan; Denso IT Laboratory Inc., Tokyo, Japan+Tokyo Institute of Technology, Tokyo, Japan; Denso IT Laboratory Inc., Tokyo, Japan+Tokyo Institute of Technology, Tokyo, Japan", "aff_domain": "core.d-itlab.co.jp; ; ; ; ", "email": "core.d-itlab.co.jp; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yashima22a.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0+1", "aff_unique_norm": "Denso IT Laboratory Inc.;Tokyo Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.denso.com;https://www.titech.ac.jp", "aff_unique_abbr": ";Titech", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "Japan" }, { "title": "Feature and Parameter Selection in Stochastic Linear Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16553", "id": "16553", "proceeding": "https://proceedings.mlr.press/v162/moradipari22a.html", "poster": "/media/PosterPDFs/ICML%202022/3472ab80b6dff70c54758fd6dfc800c2.png?t=1658252798.614722", "slides": "", "author_site": "Ahmadreza Moradipari, Berkay Turan, Yasin Abbasi-Yadkori, Mahnoosh Alizadeh, Mohammad Ghavamzadeh", "author": "Ahmadreza Moradipari; Berkay Turan; Yasin Abbasi-Yadkori; Mahnoosh Alizadeh; Mohammad Ghavamzadeh", "abstract": "We study two model selection settings in stochastic linear bandits (LB). In the first setting, which we refer to as feature selection, the expected reward of the LB problem is in the linear span of at least one of $M$ feature maps (models). In the second setting, the reward parameter of the LB problem is arbitrarily selected from $M$ models represented as (possibly) overlapping balls in $\\mathbb R^d$. However, the agent only has access to misspecified models, i.e., estimates of the centers and radii of the balls. We refer to this setting as parameter selection. For each setting, we develop and analyze a computationally efficient algorithm that is based on a reduction from bandits to full-information problems. This allows us to obtain regret bounds that are not worse (up to a $\\sqrt{\\log M}$ factor) than the case where the true model is known. This is the best reported dependence on the number of models $M$ in these settings. 
Finally, we empirically show the effectiveness of our algorithms using synthetic and real-world experiments.", "bibtex": "@InProceedings{pmlr-v162-moradipari22a,\n title = \t {Feature and Parameter Selection in Stochastic Linear Bandits},\n author = {Moradipari, Ahmadreza and Turan, Berkay and Abbasi-Yadkori, Yasin and Alizadeh, Mahnoosh and Ghavamzadeh, Mohammad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15927--15958},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/moradipari22a/moradipari22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/moradipari22a.html},\n abstract = \t {We study two model selection settings in stochastic linear bandits (LB). In the first setting, which we refer to as feature selection, the expected reward of the LB problem is in the linear span of at least one of $M$ feature maps (models). In the second setting, the reward parameter of the LB problem is arbitrarily selected from $M$ models represented as (possibly) overlapping balls in $\\mathbb R^d$. However, the agent only has access to misspecified models, i.e., estimates of the centers and radii of the balls. We refer to this setting as parameter selection. For each setting, we develop and analyze a computationally efficient algorithm that is based on a reduction from bandits to full-information problems. This allows us to obtain regret bounds that are not worse (up to a $\\sqrt{\\log M}$ factor) than the case where the true model is known. This is the best reported dependence on the number of models $M$ in these settings. 
Finally, we empirically show the effectiveness of our algorithms using synthetic and real-world experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/moradipari22a/moradipari22a.pdf", "supp": "", "pdf_size": 4106961, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17680416813726110592&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, University of California, Santa Barbara, USA; Department of Electrical and Computer Engineering, University of California, Santa Barbara, USA; DeepMind, London, UK; Department of Electrical and Computer Engineering, University of California, Santa Barbara, USA; Google Research, Mountain View, USA", "aff_domain": "ucsb.edu; ; ;ucsb.edu; ", "email": "ucsb.edu; ; ;ucsb.edu; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/moradipari22a.html", "aff_unique_index": "0;0;1;0;2", "aff_unique_norm": "University of California, Santa Barbara;DeepMind;Google", "aff_unique_dep": "Department of Electrical and Computer Engineering;;Google Research", "aff_unique_url": "https://www.ucsb.edu;https://deepmind.com;https://research.google", "aff_unique_abbr": "UCSB;DeepMind;Google", "aff_campus_unique_index": "0;0;1;0;2", "aff_campus_unique": "Santa Barbara;London;Mountain View", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Feature selection using e-values", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16745", "id": "16745", "proceeding": "https://proceedings.mlr.press/v162/majumdar22a.html", "poster": "/media/PosterPDFs/ICML%202022/1ce83e5d4135b07c0b82afffbe2b3436.png?t=1657159491.844307", "slides": "/media/icml-2022/Slides/16745.pdf", "author_site": "Subhabrata Majumdar, Snigdhansu Chatterjee", "author": "Subhabrata Majumdar; Snigdhansu Chatterjee", "abstract": "In the context of supervised learning, we introduce the concept of e-value. An e-value is a scalar quantity that represents the proximity of the sampling distribution of parameter estimates in a model trained on a subset of features to that of the model trained on all features (i.e. the full model). Under general conditions, a rank ordering of e-values separates models that contain all essential features from those that do not. For a p-dimensional feature space, this requires fitting only the full model and evaluating p+1 models, as opposed to the traditional requirement of fitting and evaluating 2^p models. The above e-values framework is applicable to a wide range of parametric models. We use data depths and a fast resampling-based algorithm to implement a feature selection procedure, providing consistency results. 
Through experiments across several model settings and synthetic and real datasets, we establish that the e-values can be a promising general alternative to existing model-specific methods of feature selection.", "bibtex": "@InProceedings{pmlr-v162-majumdar22a,\n title = \t {Feature selection using e-values},\n author = {Majumdar, Subhabrata and Chatterjee, Snigdhansu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14753--14773},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/majumdar22a/majumdar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/majumdar22a.html},\n abstract = \t {In the context of supervised learning, we introduce the concept of e-value. An e-value is a scalar quantity that represents the proximity of the sampling distribution of parameter estimates in a model trained on a subset of features to that of the model trained on all features (i.e. the full model). Under general conditions, a rank ordering of e-values separates models that contain all essential features from those that do not. For a p-dimensional feature space, this requires fitting only the full model and evaluating p+1 models, as opposed to the traditional requirement of fitting and evaluating 2^p models. The above e-values framework is applicable to a wide range of parametric models. We use data depths and a fast resampling-based algorithm to implement a feature selection procedure, providing consistency results. Through experiments across several model settings and synthetic and real datasets, we establish that the e-values can be a promising general alternative to existing model-specific methods of feature selection.}\n}", "pdf": "https://proceedings.mlr.press/v162/majumdar22a/majumdar22a.pdf", "supp": "", "pdf_size": 744986, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14169974284290385503&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "School of Statistics, University of Minnesota Twin Cities, Minneapolis, MN, USA+Currently at Splunk; School of Statistics, University of Minnesota Twin Cities, Minneapolis, MN, USA", "aff_domain": "splunk.com; ", "email": "splunk.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/majumdar22a.html", "aff_unique_index": "0+1;0", "aff_unique_norm": "University of Minnesota;Splunk", "aff_unique_dep": "School of Statistics;", "aff_unique_url": "https://www.stat.umn.edu;https://www.splunk.com", "aff_unique_abbr": "UMN;Splunk", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Twin Cities;", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "FedNL: Making Newton-Type Methods Applicable to Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17083", "id": "17083", "proceeding": "https://proceedings.mlr.press/v162/safaryan22a.html", "poster": "/media/PosterPDFs/ICML%202022/7385db9a3f11415bc0e9e2625fae3734.png?t=1657204966.4589956", "slides": "", "author_site": "Mher Safaryan, Rustem Islamov, Xun Qian, Peter Richtarik", "author": "Mher Safaryan; Rustem Islamov; Xun Qian; Peter Richtarik", "abstract": "Inspired by recent work of Islamov et al (2021), we propose a 
family of Federated Newton Learn (\\algname{FedNL}) methods, which we believe is a marked step in the direction of making second-order methods applicable to FL. In contrast to the aforementioned work, \\algname{FedNL} employs a different Hessian learning technique which i) enhances privacy as it does not rely on the training data to be revealed to the coordinating server, ii) makes it applicable beyond generalized linear models, and iii) provably works with general contractive compression operators for compressing the local Hessians, such as Top-$K$ or Rank-$R$, which are vastly superior in practice. Notably, we do not need to rely on error feedback for our methods to work with contractive compressors. Moreover, we develop \\algname{FedNL-PP}, \\algname{FedNL-CR} and \\algname{FedNL-LS}, which are variants of \\algname{FedNL} that support partial participation, and globalization via cubic regularization and line search, respectively, and \\algname{FedNL-BC}, which is a variant that can further benefit from bidirectional compression of gradients and models, i.e., smart uplink gradient and smart downlink model compression. We prove local convergence rates that are independent of the condition number, the number of training data points, and compression variance. Our communication efficient Hessian learning technique provably learns the Hessian at the optimum. Finally, we perform a variety of numerical experiments that show that our \\algname{FedNL} methods have state-of-the-art communication complexity when compared to key baselines.", "bibtex": "@InProceedings{pmlr-v162-safaryan22a,\n title = \t {{F}ed{NL}: Making {N}ewton-Type Methods Applicable to Federated Learning},\n author = {Safaryan, Mher and Islamov, Rustem and Qian, Xun and Richtarik, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18959--19010},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/safaryan22a/safaryan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/safaryan22a.html},\n abstract = \t {Inspired by recent work of Islamov et al (2021), we propose a family of Federated Newton Learn (\\algname{FedNL}) methods, which we believe is a marked step in the direction of making second-order methods applicable to FL. In contrast to the aforementioned work, \\algname{FedNL} employs a different Hessian learning technique which i) enhances privacy as it does not rely on the training data to be revealed to the coordinating server, ii) makes it applicable beyond generalized linear models, and iii) provably works with general contractive compression operators for compressing the local Hessians, such as Top-$K$ or Rank-$R$, which are vastly superior in practice. Notably, we do not need to rely on error feedback for our methods to work with contractive compressors. Moreover, we develop \\algname{FedNL-PP}, \\algname{FedNL-CR} and \\algname{FedNL-LS}, which are variants of \\algname{FedNL} that support partial participation, and globalization via cubic regularization and line search, respectively, and \\algname{FedNL-BC}, which is a variant that can further benefit from bidirectional compression of gradients and models, i.e., smart uplink gradient and smart downlink model compression. 
We prove local convergence rates that are independent of the condition number, the number of training data points, and compression variance. Our communication efficient Hessian learning technique provably learns the Hessian at the optimum. Finally, we perform a variety of numerical experiments that show that our \\algname{FedNL} methods have state-of-the-art communication complexity when compared to key baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/safaryan22a/safaryan22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/safaryan22a-supp.zip", "pdf_size": 8364647, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7364741201712659740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia+Moscow Institute of Physics and Technology (MIPT), Dolgoprudny, Russia+Institut Polytechnique de Paris (IP Paris), Palaiseau, France; King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia+Moscow Institute of Physics and Technology (MIPT), Dolgoprudny, Russia+Institut Polytechnique de Paris (IP Paris), Palaiseau, France; King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/safaryan22a.html", "aff_unique_index": "0+1+2;0+1+2;0;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology;Institut Polytechnique de Paris", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaust.edu.sa;https://www.mipt.ru;https://www.ipparis.fr", "aff_unique_abbr": "KAUST;MIPT;IP Paris", "aff_campus_unique_index": "0+1+2;0+1+2;0;0", "aff_campus_unique": "Thuwal;Dolgoprudny;Palaiseau", "aff_country_unique_index": "0+1+2;0+1+2;0;0", "aff_country_unique": "Saudi Arabia;Russian Federation;France" }, { "title": "FedNest: Federated Bilevel, Minimax, and Compositional Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17791", "id": "17791", "proceeding": "https://proceedings.mlr.press/v162/tarzanagh22a.html", "poster": "/media/PosterPDFs/ICML%202022/eb21cc0143d96dbc8e3a58f1a81e4dd2_0JDo4d4.png?t=1658242368.1592815", "slides": "", "author_site": "Davoud Ataee Tarzanagh, Mingchen Li, Christos Thrampoulidis, Samet Oymak", "author": "Davoud Ataee Tarzanagh; Mingchen Li; Christos Thrampoulidis; Samet Oymak", "abstract": "Standard federated optimization methods successfully apply to stochastic problems with single-level structure. However, many contemporary ML problems - including adversarial robustness, hyperparameter tuning, actor-critic - fall under nested bilevel programming that subsumes minimax and compositional optimization. In this work, we propose FedNest: A federated alternating stochastic gradient method to address general nested problems. We establish provable convergence rates for FedNest in the presence of heterogeneous data and introduce variations for bilevel, minimax, and compositional optimization. FedNest introduces multiple innovations including federated hypergradient computation and variance reduction to address inner-level heterogeneity. 
We complement our theory with experiments on hyperparameter & hyper-representation learning and minimax optimization that demonstrate the benefits of our method in practice.", "bibtex": "@InProceedings{pmlr-v162-tarzanagh22a,\n title = \t {{F}ed{N}est: Federated Bilevel, Minimax, and Compositional Optimization},\n author = {Tarzanagh, Davoud Ataee and Li, Mingchen and Thrampoulidis, Christos and Oymak, Samet},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21146--21179},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tarzanagh22a/tarzanagh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tarzanagh22a.html},\n abstract = \t {Standard federated optimization methods successfully apply to stochastic problems with single-level structure. However, many contemporary ML problems - including adversarial robustness, hyperparameter tuning, actor-critic - fall under nested bilevel programming that subsumes minimax and compositional optimization. In this work, we propose FedNest: A federated alternating stochastic gradient method to address general nested problems. We establish provable convergence rates for FedNest in the presence of heterogeneous data and introduce variations for bilevel, minimax, and compositional optimization. FedNest introduces multiple innovations including federated hypergradient computation and variance reduction to address inner-level heterogeneity. We complement our theory with experiments on hyperparameter & hyper-representation learning and minimax optimization that demonstrate the benefits of our method in practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/tarzanagh22a/tarzanagh22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tarzanagh22a-supp.zip", "pdf_size": 1461822, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7138561365880400777&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Michigan; University of California, Riverside; University of British Columbia; University of California, Riverside", "aff_domain": "umich.edu;ucr.edu;ece.ubc.ca;ece.ucr.edu", "email": "umich.edu;ucr.edu;ece.ubc.ca;ece.ucr.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/tarzanagh22a.html", "aff_unique_index": "0;1;2;1", "aff_unique_norm": "University of Michigan;University of California, Riverside;University of British Columbia", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umich.edu;https://www.ucr.edu;https://www.ubc.ca", "aff_unique_abbr": "UM;UCR;UBC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Riverside", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "FedNew: A Communication-Efficient and Privacy-Preserving Newton-Type Method for Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16853", "id": "16853", "proceeding": "https://proceedings.mlr.press/v162/elgabli22a.html", "poster": "", "slides": "", "author_site": "Anis Elgabli, Chaouki Ben Issaid, Amrit Singh Bedi, Ketan Rajawat, Mehdi Bennis, Vaneet Aggarwal", "author": "Anis Elgabli; Chaouki Ben Issaid; Amrit Singh Bedi; Ketan Rajawat; 
Mehdi Bennis; Vaneet Aggarwal", "abstract": "Newton-type methods are popular in federated learning due to their fast convergence. Still, they suffer from two main issues, namely: low communication efficiency and low privacy due to the requirement of sending Hessian information from clients to parameter server (PS). In this work, we introduce a novel framework called FedNew in which there is no need to transmit Hessian information from clients to PS, hence resolving the bottleneck to improve communication efficiency. In addition, FedNew hides the gradient information and results in a privacy-preserving approach compared to the existing state-of-the-art. The core novel idea in FedNew is to introduce a two-level framework, and alternate between updating the inverse Hessian-gradient product using only one alternating direction method of multipliers (ADMM) step and then performing the global model update using Newton\u2019s method. Though only one ADMM pass is used to approximate the inverse Hessian-gradient product at each iteration, we develop a novel theoretical approach to show the convergence behavior of FedNew for convex problems. Additionally, a significant reduction in communication overhead is achieved by utilizing stochastic quantization. Numerical results using real datasets show the superiority of FedNew compared to existing methods in terms of communication costs.", "bibtex": "@InProceedings{pmlr-v162-elgabli22a,\n title = \t {{F}ed{N}ew: A Communication-Efficient and Privacy-Preserving {N}ewton-Type Method for Federated Learning},\n author = {Elgabli, Anis and Issaid, Chaouki Ben and Bedi, Amrit Singh and Rajawat, Ketan and Bennis, Mehdi and Aggarwal, Vaneet},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5861--5877},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/elgabli22a/elgabli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/elgabli22a.html},\n abstract = \t {Newton-type methods are popular in federated learning due to their fast convergence. Still, they suffer from two main issues, namely: low communication efficiency and low privacy due to the requirement of sending Hessian information from clients to parameter server (PS). In this work, we introduce a novel framework called FedNew in which there is no need to transmit Hessian information from clients to PS, hence resolving the bottleneck to improve communication efficiency. In addition, FedNew hides the gradient information and results in a privacy-preserving approach compared to the existing state-of-the-art. The core novel idea in FedNew is to introduce a two-level framework, and alternate between updating the inverse Hessian-gradient product using only one alternating direction method of multipliers (ADMM) step and then performing the global model update using Newton\u2019s method. Though only one ADMM pass is used to approximate the inverse Hessian-gradient product at each iteration, we develop a novel theoretical approach to show the convergence behavior of FedNew for convex problems. Additionally, a significant reduction in communication overhead is achieved by utilizing stochastic quantization. 
Numerical results using real datasets show the superiority of FedNew compared to existing methods in terms of communication costs.}\n}", "pdf": "https://proceedings.mlr.press/v162/elgabli22a/elgabli22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/elgabli22a-supp.zip", "pdf_size": 514965, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13605239667986344129&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Oulu; University of Oulu; University of Maryland, College Park, MD, USA; Indian Institute of Technology Kanpur, India; University of Oulu; Purdue University, IN, USA", "aff_domain": "oulu.fi; ; ; ; ; ", "email": "oulu.fi; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/elgabli22a.html", "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "University of Oulu;University of Maryland;Indian Institute of Technology Kanpur;Purdue University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.oulu.fi;https://www.umd.edu;https://www.iitk.ac.in;https://www.purdue.edu", "aff_unique_abbr": "UOulu;UMD;IIT Kanpur;Purdue", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";College Park;Kanpur;Indiana", "aff_country_unique_index": "0;0;1;2;0;1", "aff_country_unique": "Finland;United States;India" }, { "title": "FedScale: Benchmarking Model and System Performance of Federated Learning at Scale", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16277", "id": "16277", "proceeding": "https://proceedings.mlr.press/v162/lai22a.html", "poster": "/media/PosterPDFs/ICML%202022/731c83db8d2ff01bdc000083fd3c3740.png?t=1658123958.6979942", "slides": "", "author_site": "Fan Lai, Yinwei Dai, Sanjay Singapuram, Jiachen Liu, Xiangfeng Zhu, Harsha Madhyastha, Mosharaf Chowdhury", "author": "Fan Lai; Yinwei Dai; Sanjay Singapuram; Jiachen Liu; Xiangfeng Zhu; Harsha Madhyastha; Mosharaf Chowdhury", "abstract": "We present FedScale, a federated learning (FL) benchmarking suite with realistic datasets and a scalable runtime to enable reproducible FL research. FedScale datasets encompass a wide range of critical FL tasks, ranging from image classification and object detection to language modeling and speech recognition. Each dataset comes with a unified evaluation protocol using real-world data splits and evaluation metrics. To reproduce realistic FL behavior, FedScale contains a scalable and extensible runtime. It provides high-level APIs to implement FL algorithms, deploy them at scale across diverse hardware and software backends, and evaluate them at scale, all with minimal developer efforts. We combine the two to perform systematic benchmarking experiments and highlight potential opportunities for heterogeneity-aware co-optimizations in FL. FedScale is open-source and actively maintained by contributors from different institutions at http://fedscale.ai. 
We welcome feedback and contributions from the community.", "bibtex": "@InProceedings{pmlr-v162-lai22a,\n title = \t {{F}ed{S}cale: Benchmarking Model and System Performance of Federated Learning at Scale},\n author = {Lai, Fan and Dai, Yinwei and Singapuram, Sanjay and Liu, Jiachen and Zhu, Xiangfeng and Madhyastha, Harsha and Chowdhury, Mosharaf},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11814--11827},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lai22a/lai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lai22a.html},\n abstract = \t {We present FedScale, a federated learning (FL) benchmarking suite with realistic datasets and a scalable runtime to enable reproducible FL research. FedScale datasets encompass a wide range of critical FL tasks, ranging from image classification and object detection to language modeling and speech recognition. Each dataset comes with a unified evaluation protocol using real-world data splits and evaluation metrics. To reproduce realistic FL behavior, FedScale contains a scalable and extensible runtime. It provides high-level APIs to implement FL algorithms, deploy them at scale across diverse hardware and software backends, and evaluate them at scale, all with minimal developer efforts. We combine the two to perform systematic benchmarking experiments and highlight potential opportunities for heterogeneity-aware co-optimizations in FL. FedScale is open-source and actively maintained by contributors from different institutions at http://fedscale.ai. 
We welcome feedback and contributions from the community.}\n}", "pdf": "https://proceedings.mlr.press/v162/lai22a/lai22a.pdf", "supp": "", "pdf_size": 864060, "gs_citation": 299, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9366536104914467915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 23, "aff": "Department of Computer Science, University of Michigan; Department of Computer Science, University of Michigan; Department of Computer Science, University of Michigan; Department of Computer Science, University of Michigan; Department of Computer Science, University of Michigan + Department of Computer Science, University of Washington; Department of Computer Science, University of Michigan; Department of Computer Science, University of Michigan", "aff_domain": "umich.edu; ; ; ; ; ; ", "email": "umich.edu; ; ; ; ; ; ", "github": "", "project": "http://fedscale.ai", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/lai22a.html", "aff_unique_index": "0;0;0;0;0+1;0;0", "aff_unique_norm": "University of Michigan;University of Washington", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.umich.edu;https://www.washington.edu", "aff_unique_abbr": "UM;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Federated Learning with Label Distribution Skew via Logits Calibration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16221", "id": "16221", "proceeding": "https://proceedings.mlr.press/v162/zhang22p.html", "poster": "", "slides": "", "author_site": "Jie Zhang, Zhiqi Li, Bo Li, Jianghe Xu, Shuang Wu, Shouhong Ding, Chao Wu", "gs_citation": 193, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17283094020787492185&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "author": "", "aff": "", "aff_domain": "", "email": "", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/zhang22p.html" }, { "title": "Federated Learning with Partial Model Personalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16615", "id": "16615", "proceeding": "https://proceedings.mlr.press/v162/pillutla22a.html", "poster": "/media/PosterPDFs/ICML%202022/ac0b236e346da355400a90fcc7e28be6.png?t=1657643561.5627902", "slides": "", "author_site": "Krishna Pillutla, Kshitiz Malik, Abdel-rahman Mohamed, Michael Rabbat, Maziar Sanjabi, Lin Xiao", "author": "Krishna Pillutla; Kshitiz Malik; Abdel-Rahman Mohamed; Mike Rabbat; Maziar Sanjabi; Lin Xiao", "abstract": "We consider two federated learning algorithms for training partially personalized models, where the shared and personal parameters are updated either simultaneously or alternately on the devices. Both algorithms have been proposed in the literature, but their convergence properties are not fully understood, especially for the alternating variant. We provide convergence analyses of both algorithms in the general nonconvex setting with partial participation and delineate the regime where one dominates the other. 
Our experiments on real-world image, text, and speech datasets demonstrate that (a) partial personalization can obtain most of the benefits of full model personalization with a small fraction of personal parameters, and, (b) the alternating update algorithm outperforms the simultaneous update algorithm by a small but consistent margin.", "bibtex": "@InProceedings{pmlr-v162-pillutla22a,\n title = \t {Federated Learning with Partial Model Personalization},\n author = {Pillutla, Krishna and Malik, Kshitiz and Mohamed, Abdel-Rahman and Rabbat, Mike and Sanjabi, Maziar and Xiao, Lin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17716--17758},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pillutla22a/pillutla22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pillutla22a.html},\n abstract = \t {We consider two federated learning algorithms for training partially personalized models, where the shared and personal parameters are updated either simultaneously or alternately on the devices. Both algorithms have been proposed in the literature, but their convergence properties are not fully understood, especially for the alternating variant. We provide convergence analyses of both algorithms in the general nonconvex setting with partial participation and delineate the regime where one dominates the other. Our experiments on real-world image, text, and speech datasets demonstrate that (a) partial personalization can obtain most of the benefits of full model personalization with a small fraction of personal parameters, and, (b) the alternating update algorithm outperforms the simultaneous update algorithm by a small but consistent margin.}\n}", "pdf": "https://proceedings.mlr.press/v162/pillutla22a/pillutla22a.pdf", "supp": "", "pdf_size": 1201141, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4750968691898857474&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Paul G. Allen School of Computer Science & Engineering, University of Washington; Meta AI; Meta AI; Meta AI; Meta AI; Meta AI", "aff_domain": "cs.washington.edu; ; ; ; ; ", "email": "cs.washington.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/pillutla22a.html", "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "University of Washington;Meta", "aff_unique_dep": "Paul G. 
Allen School of Computer Science & Engineering;Meta AI", "aff_unique_url": "https://www.washington.edu;https://meta.com", "aff_unique_abbr": "UW;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Federated Learning with Positive and Unlabeled Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17089", "id": "17089", "proceeding": "https://proceedings.mlr.press/v162/lin22b.html", "poster": "/media/PosterPDFs/ICML%202022/16e6a3326dd7d868cbc926602a61e4d0.png?t=1657248979.552321", "slides": "", "author_site": "Xinyang Lin, Hanting Chen, Yixing Xu, Chao Xu, Xiaolin Gui, Yiping Deng, Yunhe Wang", "author": "Xinyang Lin; Hanting Chen; Yixing Xu; Chao Xu; Xiaolin Gui; Yiping Deng; Yunhe Wang", "abstract": "We study the problem of learning from positive and unlabeled (PU) data in the federated setting, where each client only labels a small part of their dataset due to the limitation of resources and time. Different from the settings in traditional PU learning where the negative class consists of a single class, the negative samples which cannot be identified by a client in the federated setting may come from multiple classes which are unknown to the client. Therefore, existing PU learning methods can hardly be applied in this situation. To address this problem, we propose a novel framework, namely Federated learning with Positive and Unlabeled data (FedPU), to minimize the expected risk of multiple negative classes by leveraging the labeled data in other clients. We theoretically analyze the generalization bound of the proposed FedPU. Empirical experiments show that the FedPU can achieve much better performance than conventional supervised and semi-supervised federated learning methods.", "bibtex": "@InProceedings{pmlr-v162-lin22b,\n title = \t {Federated Learning with Positive and Unlabeled Data},\n author = {Lin, Xinyang and Chen, Hanting and Xu, Yixing and Xu, Chao and Gui, Xiaolin and Deng, Yiping and Wang, Yunhe},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13344--13355},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22b/lin22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22b.html},\n abstract = \t {We study the problem of learning from positive and unlabeled (PU) data in the federated setting, where each client only labels a small part of their dataset due to the limitation of resources and time. Different from the settings in traditional PU learning where the negative class consists of a single class, the negative samples which cannot be identified by a client in the federated setting may come from multiple classes which are unknown to the client. Therefore, existing PU learning methods can hardly be applied in this situation. To address this problem, we propose a novel framework, namely Federated learning with Positive and Unlabeled data (FedPU), to minimize the expected risk of multiple negative classes by leveraging the labeled data in other clients. We theoretically analyze the generalization bound of the proposed FedPU. 
Empirical experiments show that the FedPU can achieve much better performance than conventional supervised and semi-supervised federated learning methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22b/lin22b.pdf", "supp": "", "pdf_size": 533289, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5808543531345013860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Key Lab of Machine Perception (MOE), Department of Machine Intelligence, Peking University, China; Faculty of Electronic and Information Engineering, Xi\u2019an Jiaotong University; Central Software Institution, Huawei Technologies; Huawei Noah\u2019s Ark Lab+Central Software Institution, Huawei Technologies", "aff_domain": "xjtu.edu.cn;huawei.com;huawei.com;pku.edu.cn;xjtu.edu.cn;huawei.com;huawei.com", "email": "xjtu.edu.cn;huawei.com;huawei.com;pku.edu.cn;xjtu.edu.cn;huawei.com;huawei.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/lin22b.html", "aff_unique_index": "0+1;1;1;2;0;1;1+1", "aff_unique_norm": "Xi'an Jiao Tong University;Huawei;Peking University", "aff_unique_dep": "Faculty of Electronic and Information Engineering;Noah\u2019s Ark Lab;Department of Machine Intelligence", "aff_unique_url": "http://www.xjtu.edu.cn;https://www.huawei.com;http://www.pku.edu.cn", "aff_unique_abbr": "XJTU;Huawei;Peking University", "aff_campus_unique_index": "0;0;", "aff_campus_unique": "Xi'an;", "aff_country_unique_index": "0+0;0;0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "Federated Minimax Optimization: Improved Convergence Analyses and Algorithms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17435", "id": "17435", "proceeding": "https://proceedings.mlr.press/v162/sharma22c.html", "poster": "/media/PosterPDFs/ICML%202022/596f713f9a7376fe90a62abaaedecc2d.png?t=1658001884.910279", "slides": "/media/icml-2022/Slides/17435.pdf", "author_site": "PRANAY SHARMA, Rohan Panda, Gauri Joshi, Pramod K Varshney", "author": "Pranay Sharma; Rohan Panda; Gauri Joshi; Pramod Varshney", "abstract": "In this paper, we consider nonconvex minimax optimization, which is gaining prominence in many modern machine learning applications, such as GANs. Large-scale edge-based collection of training data in these applications calls for communication-efficient distributed optimization algorithms, such as those used in federated learning, to process the data. In this paper, we analyze local stochastic gradient descent ascent (SGDA), the local-update version of the SGDA algorithm. SGDA is the core algorithm used in minimax optimization, but it is not well-understood in a distributed setting. 
We prove that Local SGDA has", "bibtex": "@InProceedings{pmlr-v162-sharma22c,\n title = \t {Federated Minimax Optimization: Improved Convergence Analyses and Algorithms},\n author = {Sharma, Pranay and Panda, Rohan and Joshi, Gauri and Varshney, Pramod},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19683--19730},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sharma22c/sharma22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/sharma22c.html},\n abstract = \t {In this paper, we consider nonconvex minimax optimization, which is gaining prominence in many modern machine learning applications, such as GANs. Large-scale edge-based collection of training data in these applications calls for communication-efficient distributed optimization algorithms, such as those used in federated learning, to process the data. In this paper, we analyze local stochastic gradient descent ascent (SGDA), the local-update version of the SGDA algorithm. SGDA is the core algorithm used in minimax optimization, but it is not well-understood in a distributed setting. We prove that Local SGDA has", "pdf": "https://proceedings.mlr.press/v162/sharma22c/sharma22c.pdf", "supp": "", "pdf_size": 777287, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10245452703943219447&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA; Department of Electrical Engineering and Computer Science, Syracuse University, Syracuse, NY", "aff_domain": "andrew.cmu.edu; ; ; ", "email": "andrew.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/sharma22c.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Carnegie Mellon University;Syracuse University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.cmu.edu;https://www.syracuse.edu", "aff_unique_abbr": "CMU;Syracuse", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Pittsburgh;Syracuse", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Federated Reinforcement Learning: Linear Speedup Under Markovian Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16655", "id": "16655", "proceeding": "https://proceedings.mlr.press/v162/khodadadian22a.html", "poster": "/media/PosterPDFs/ICML%202022/3a20f62a0af1aa152670bab3c602feed.png?t=1657932178.067209", "slides": "", "author_site": "sajad khodadadian, PRANAY SHARMA, Gauri Joshi, Siva Maguluri", "author": "Sajad Khodadadian; Pranay Sharma; Gauri Joshi; Siva Theja Maguluri", "abstract": "Since reinforcement learning algorithms are notoriously data-intensive, the task of sampling observations from the environment is usually split across multiple agents. 
However, transferring these observations from the agents to a central location can be prohibitively expensive in terms of the communication cost, and it can also compromise the privacy of each agent\u2019s local behavior policy. In this paper, we consider a federated reinforcement learning framework where multiple agents collaboratively learn a global model, without sharing their individual data and policies. Each agent maintains a local copy of the model and updates it using locally sampled data. Although having N agents enables the sampling of N times more data, it is not clear if it leads to proportional convergence speedup. We propose federated versions of on-policy TD, off-policy TD and Q-learning, and analyze their convergence. For all these algorithms, to the best of our knowledge, we are the first to consider Markovian noise and multiple local updates, and prove a linear convergence speedup with respect to the number of agents. To obtain these results, we show that federated TD and Q-learning are special cases of a general framework for federated stochastic approximation with Markovian noise, and we leverage this framework to provide a unified convergence analysis that applies to all the algorithms.", "bibtex": "@InProceedings{pmlr-v162-khodadadian22a,\n title = \t {Federated Reinforcement Learning: Linear Speedup Under {M}arkovian Sampling},\n author = {Khodadadian, Sajad and Sharma, Pranay and Joshi, Gauri and Maguluri, Siva Theja},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10997--11057},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/khodadadian22a/khodadadian22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/khodadadian22a.html},\n abstract = \t {Since reinforcement learning algorithms are notoriously data-intensive, the task of sampling observations from the environment is usually split across multiple agents. However, transferring these observations from the agents to a central location can be prohibitively expensive in terms of the communication cost, and it can also compromise the privacy of each agent\u2019s local behavior policy. In this paper, we consider a federated reinforcement learning framework where multiple agents collaboratively learn a global model, without sharing their individual data and policies. Each agent maintains a local copy of the model and updates it using locally sampled data. Although having N agents enables the sampling of N times more data, it is not clear if it leads to proportional convergence speedup. We propose federated versions of on-policy TD, off-policy TD and Q-learning, and analyze their convergence. For all these algorithms, to the best of our knowledge, we are the first to consider Markovian noise and multiple local updates, and prove a linear convergence speedup with respect to the number of agents. 
To obtain these results, we show that federated TD and Q-learning are special cases of a general framework for federated stochastic approximation with Markovian noise, and we leverage this framework to provide a unified convergence analysis that applies to all the algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/khodadadian22a/khodadadian22a.pdf", "supp": "", "pdf_size": 1045879, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10020216043410692900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "H. Milton Stewart School of Industrial & Systems Engineering, Georgia Institute of Technology, Atlanta, GA, 30332, USA; Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, 15213, USA; Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, 15213, USA; H. Milton Stewart School of Industrial & Systems Engineering, Georgia Institute of Technology, Atlanta, GA, 30332, USA", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/khodadadian22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Georgia Institute of Technology;Carnegie Mellon University", "aff_unique_dep": "H. Milton Stewart School of Industrial & Systems Engineering;Electrical and Computer Engineering", "aff_unique_url": "https://www.gatech.edu;https://www.cmu.edu", "aff_unique_abbr": "Georgia Tech;CMU", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Atlanta;Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fenrir: Physics-Enhanced Regression for Initial Value Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17163", "id": "17163", "proceeding": "https://proceedings.mlr.press/v162/tronarp22a.html", "poster": "/media/PosterPDFs/ICML%202022/b59307fdacf7b2db12ec4bd5ca1caba8.png?t=1656329055.1729488", "slides": "/media/icml-2022/Slides/17163.pdf", "author_site": "Filip Tronarp, Nathanael Bosch, Philipp Hennig", "author": "Filip Tronarp; Nathanael Bosch; Philipp Hennig", "abstract": "We show how probabilistic numerics can be used to convert an initial value problem into a Gauss\u2013Markov process parametrised by the dynamics of the initial value problem. Consequently, the often difficult problem of parameter estimation in ordinary differential equations is reduced to hyper-parameter estimation in Gauss\u2013Markov regression, which tends to be considerably easier. The method\u2019s relation and benefits in comparison to classical numerical integration and gradient matching approaches is elucidated. In particular, the method can, in contrast to gradient matching, handle partial observations, and has certain routes for escaping local optima not available to classical numerical integration. 
Experimental results demonstrate that the method is on par or moderately better than competing approaches.", "bibtex": "@InProceedings{pmlr-v162-tronarp22a,\n title = \t {Fenrir: Physics-Enhanced Regression for Initial Value Problems},\n author = {Tronarp, Filip and Bosch, Nathanael and Hennig, Philipp},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21776--21794},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tronarp22a/tronarp22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tronarp22a.html},\n abstract = \t {We show how probabilistic numerics can be used to convert an initial value problem into a Gauss\u2013Markov process parametrised by the dynamics of the initial value problem. Consequently, the often difficult problem of parameter estimation in ordinary differential equations is reduced to hyper-parameter estimation in Gauss\u2013Markov regression, which tends to be considerably easier. The method\u2019s relation and benefits in comparison to classical numerical integration and gradient matching approaches is elucidated. In particular, the method can, in contrast to gradient matching, handle partial observations, and has certain routes for escaping local optima not available to classical numerical integration. Experimental results demonstrate that the method is on par or moderately better than competing approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/tronarp22a/tronarp22a.pdf", "supp": "", "pdf_size": 935069, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12271464698173319316&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany+Max\u2013Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany+Max\u2013Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Department of Computer Science, University of T\u00fcbingen, T\u00fcbingen, Germany+Max\u2013Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de; ", "email": "uni-tuebingen.de;uni-tuebingen.de; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tronarp22a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "University of T\u00fcbingen;Max-Planck Institute for Intelligent Systems", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.uni-tuebingen.de;https://www.mpi-is.mpg.de", "aff_unique_abbr": ";MPI-IS", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "T\u00fcbingen", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Germany" }, { "title": "Fictitious Play and Best-Response Dynamics in Identical Interest and Zero-Sum Stochastic Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17363", "id": "17363", "proceeding": "https://proceedings.mlr.press/v162/baudin22a.html", "poster": "/media/PosterPDFs/ICML%202022/f6185f0ef02dcaec414a3171cd01c697.png?t=1658133189.3209274", "slides": "", "author_site": "Lucas Baudin, Rida Laraki", "author": "Lucas 
Baudin; Rida Laraki", "abstract": "This paper proposes an extension of a popular decentralized discrete-time learning procedure when repeating a static game called fictitious play (FP) (Brown, 1951; Robinson, 1951) to a dynamic model called discounted stochastic game (Shapley, 1953). Our family of discrete-time FP procedures is proven to converge to the set of stationary Nash equilibria in identical interest discounted stochastic games. This extends similar convergence results for static games (Monderer & Shapley, 1996a). We then analyze the continuous-time counterpart of our FP procedures, which include as a particular case the best-response dynamic introduced and studied by Leslie et al. (2020) in the context of zero-sum stochastic games. We prove the convergence of this dynamics to stationary Nash equilibria in identical-interest and zero-sum discounted stochastic games. Thanks to stochastic approximations, we can infer from the continuous-time convergence some discrete time results such as the convergence to stationary equilibria in zero-sum and team stochastic games (Holler, 2020).", "bibtex": "@InProceedings{pmlr-v162-baudin22a,\n title = \t {Fictitious Play and Best-Response Dynamics in Identical Interest and Zero-Sum Stochastic Games},\n author = {Baudin, Lucas and Laraki, Rida},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1664--1690},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/baudin22a/baudin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/baudin22a.html},\n abstract = \t {This paper proposes an extension of a popular decentralized discrete-time learning procedure when repeating a static game called fictitious play (FP) (Brown, 1951; Robinson, 1951) to a dynamic model called discounted stochastic game (Shapley, 1953). Our family of discrete-time FP procedures is proven to converge to the set of stationary Nash equilibria in identical interest discounted stochastic games. This extends similar convergence results for static games (Monderer & Shapley, 1996a). We then analyze the continuous-time counterpart of our FP procedures, which include as a particular case the best-response dynamic introduced and studied by Leslie et al. (2020) in the context of zero-sum stochastic games. We prove the convergence of this dynamics to stationary Nash equilibria in identical-interest and zero-sum discounted stochastic games. 
Thanks to stochastic approximations, we can infer from the continuous-time convergence some discrete time results such as the convergence to stationary equilibria in zero-sum and team stochastic games (Holler, 2020).}\n}", "pdf": "https://proceedings.mlr.press/v162/baudin22a/baudin22a.pdf", "supp": "", "pdf_size": 409910, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13607670484444001011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Universit\u00e9 Paris-Dauphine - PSL, France+University of Liverpool, United Kingdom; Universit\u00e9 Paris-Dauphine - PSL, France+University of Liverpool, United Kingdom", "aff_domain": "dauphine.eu; ", "email": "dauphine.eu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/baudin22a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Universit\u00e9 Paris-Dauphine;University of Liverpool", "aff_unique_dep": ";", "aff_unique_url": "https://www.univ-paris-dauphine.fr;https://www.liverpool.ac.uk", "aff_unique_abbr": "UPD;Liv Uni", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1", "aff_country_unique": "France;United Kingdom" }, { "title": "Fighting Fire with Fire: Avoiding DNN Shortcuts through Priming", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17399", "id": "17399", "proceeding": "https://proceedings.mlr.press/v162/wen22d.html", "poster": "/media/PosterPDFs/ICML%202022/97d98119037c5b8a9663cb21fb8ebf47_SGx7acI.png?t=1658162181.4910822", "slides": "", "author_site": "Chuan Wen, Jianing Qian, Jierui Lin, Jiaye Teng, Dinesh Jayaraman, Yang Gao", "author": "Chuan Wen; Jianing Qian; Jierui Lin; Jiaye Teng; Dinesh Jayaraman; Yang Gao", "abstract": "Across applications spanning supervised classification and sequential control, deep learning has been reported to find \u201cshortcut\u201d solutions that fail catastrophically under minor changes in the data distribution. In this paper, we show empirically that DNNs can be coaxed to avoid poor shortcuts by providing an additional \u201cpriming\u201d feature computed from key input features, usually a coarse output estimate. Priming relies on approximate domain knowledge of these task-relevant key input features, which is often easy to obtain in practical settings. For example, one might prioritize recent frames over past frames in a video input for visual imitation learning, or salient foreground over background pixels for image classification. On NICO image classification, MuJoCo continuous control, and CARLA autonomous driving, our priming strategy works significantly better than several popular state-of-the-art approaches for feature selection and data augmentation. 
We connect these empirical findings to recent theoretical results on DNN optimization, and argue theoretically that priming distracts the optimizer away from poor shortcuts by creating better, simpler shortcuts.", "bibtex": "@InProceedings{pmlr-v162-wen22d,\n title = \t {Fighting Fire with Fire: Avoiding {DNN} Shortcuts through Priming},\n author = {Wen, Chuan and Qian, Jianing and Lin, Jierui and Teng, Jiaye and Jayaraman, Dinesh and Gao, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23723--23750},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wen22d/wen22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/wen22d.html},\n abstract = \t {Across applications spanning supervised classification and sequential control, deep learning has been reported to find \u201cshortcut\u201d solutions that fail catastrophically under minor changes in the data distribution. In this paper, we show empirically that DNNs can be coaxed to avoid poor shortcuts by providing an additional \u201cpriming\u201d feature computed from key input features, usually a coarse output estimate. Priming relies on approximate domain knowledge of these task-relevant key input features, which is often easy to obtain in practical settings. For example, one might prioritize recent frames over past frames in a video input for visual imitation learning, or salient foreground over background pixels for image classification. On NICO image classification, MuJoCo continuous control, and CARLA autonomous driving, our priming strategy works significantly better than several popular state-of-the-art approaches for feature selection and data augmentation. 
We connect these empirical findings to recent theoretical results on DNN optimization, and argue theoretically that priming distracts the optimizer away from poor shortcuts by creating better, simpler shortcuts.}\n}", "pdf": "https://proceedings.mlr.press/v162/wen22d/wen22d.pdf", "supp": "", "pdf_size": 2691868, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5144181957851473106&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University; University of Pennsylvania; University of Texas at Austin; Institute for Interdisciplinary Information Sciences, Tsinghua University; University of Pennsylvania; Shanghai Qi Zhi Institute", "aff_domain": "tsinghua.edu.cn; ; ; ; ;tsinghua.edu.cn", "email": "tsinghua.edu.cn; ; ; ; ;tsinghua.edu.cn", "github": "", "project": "https://sites.google.com/view/icml22-fighting-fire-with-fire/", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wen22d.html", "aff_unique_index": "0;1;2;0;1;3", "aff_unique_norm": "Tsinghua University;University of Pennsylvania;University of Texas at Austin;Shanghai Qi Zhi Institute", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.upenn.edu;https://www.utexas.edu;https://www.qz.io", "aff_unique_abbr": "Tsinghua;UPenn;UT Austin;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Finding Global Homophily in Graph Neural Networks When Meeting Heterophily", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16869", "id": "16869", "proceeding": "https://proceedings.mlr.press/v162/li22ad.html", "poster": "/media/PosterPDFs/ICML%202022/063e26c670d07bb7c4d30e6fc69fe056.png?t=1658200140.2691686", "slides": "", "author_site": "Xiang Li, Renyu Zhu, Yao Cheng, Caihua Shan, Siqiang Luo, Dongsheng Li, Weining Qian", "author": "Xiang Li; Renyu Zhu; Yao Cheng; Caihua Shan; Siqiang Luo; Dongsheng Li; Weining Qian", "abstract": "We investigate graph neural networks on graphs with heterophily. Some existing methods amplify a node\u2019s neighborhood with multi-hop neighbors to include more nodes with homophily. However, it is a significant challenge to set personalized neighborhood sizes for different nodes. Further, for other homophilous nodes excluded in the neighborhood, they are ignored for information aggregation. To address these problems, we propose two models GloGNN and GloGNN++, which generate a node\u2019s embedding by aggregating information from global nodes in the graph. In each layer, both models learn a coefficient matrix to capture the correlations between nodes, based on which neighborhood aggregation is performed. The coefficient matrix allows signed values and is derived from an optimization problem that has a closed-form solution. We further accelerate neighborhood aggregation and derive a linear time complexity. We theoretically explain the models\u2019 effectiveness by proving that both the coefficient matrix and the generated node embedding matrix have the desired grouping effect. We conduct extensive experiments to compare our models against 11 other competitors on 15 benchmark datasets in a wide range of domains, scales and graph heterophilies. 
Experimental results show that our methods achieve superior performance and are also very efficient.", "bibtex": "@InProceedings{pmlr-v162-li22ad,\n title = \t {Finding Global Homophily in Graph Neural Networks When Meeting Heterophily},\n author = {Li, Xiang and Zhu, Renyu and Cheng, Yao and Shan, Caihua and Luo, Siqiang and Li, Dongsheng and Qian, Weining},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13242--13256},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22ad/li22ad.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22ad.html},\n abstract = \t {We investigate graph neural networks on graphs with heterophily. Some existing methods amplify a node\u2019s neighborhood with multi-hop neighbors to include more nodes with homophily. However, it is a significant challenge to set personalized neighborhood sizes for different nodes. Further, for other homophilous nodes excluded in the neighborhood, they are ignored for information aggregation. To address these problems, we propose two models GloGNN and GloGNN++, which generate a node\u2019s embedding by aggregating information from global nodes in the graph. In each layer, both models learn a coefficient matrix to capture the correlations between nodes, based on which neighborhood aggregation is performed. The coefficient matrix allows signed values and is derived from an optimization problem that has a closed-form solution. We further accelerate neighborhood aggregation and derive a linear time complexity. We theoretically explain the models\u2019 effectiveness by proving that both the coefficient matrix and the generated node embedding matrix have the desired grouping effect. We conduct extensive experiments to compare our models against 11 other competitors on 15 benchmark datasets in a wide range of domains, scales and graph heterophilies. 
Experimental results show that our methods achieve superior performance and are also very efficient.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22ad/li22ad.pdf", "supp": "", "pdf_size": 969124, "gs_citation": 263, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=881393506933530763&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Data Science and Engineering, East China Normal University, Shanghai, China; School of Data Science and Engineering, East China Normal University, Shanghai, China; School of Data Science and Engineering, East China Normal University, Shanghai, China; Microsoft Research Asia, Shanghai, China; School of Computer Science and Engineering, Nanyang Technological University, Singapore; Microsoft Research Asia, Shanghai, China; School of Data Science and Engineering, East China Normal University, Shanghai, China", "aff_domain": "dase.ecnu.edu.cn; ; ; ; ; ; ", "email": "dase.ecnu.edu.cn; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/li22ad.html", "aff_unique_index": "0;0;0;1;2;1;0", "aff_unique_norm": "East China Normal University;Microsoft;Nanyang Technological University", "aff_unique_dep": "School of Data Science and Engineering;Research;School of Computer Science and Engineering", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.microsoft.com/en-us/research/group/asia;https://www.ntu.edu.sg", "aff_unique_abbr": "ECNU;MSRA;NTU", "aff_campus_unique_index": "0;0;0;0;1;0;0", "aff_campus_unique": "Shanghai;Singapore", "aff_country_unique_index": "0;0;0;0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Finding the Task-Optimal Low-Bit Sub-Distribution in Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16647", "id": "16647", "proceeding": "https://proceedings.mlr.press/v162/dong22a.html", "poster": "/media/PosterPDFs/ICML%202022/f61d6947467ccd3aa5af24db320235dd_FR0Mhqd.png?t=1656087086.7096856", "slides": "/media/icml-2022/Slides/16647.pdf", "author_site": "Runpei Dong, Zhanhong Tan, Mengdi Wu, Linfeng Zhang, Kaisheng Ma", "author": "Runpei Dong; Zhanhong Tan; Mengdi Wu; Linfeng Zhang; Kaisheng Ma", "abstract": "Quantized neural networks typically require smaller memory footprints and lower computation complexity, which is crucial for efficient deployment. However, quantization inevitably leads to a distribution divergence from the original network, which generally degrades the performance. To tackle this issue, massive efforts have been made, but most existing approaches lack statistical considerations and depend on several manual configurations. In this paper, we present an adaptive-mapping quantization method to learn an optimal latent sub-distribution that is inherent within models and smoothly approximated with a concrete Gaussian Mixture (GM). In particular, the network weights are projected in compliance with the GM-approximated sub-distribution. This sub-distribution evolves along with the weight update in a co-tuning schema guided by the direct task-objective optimization. Sufficient experiments on image classification and object detection over various modern architectures demonstrate the effectiveness, generalization property, and transferability of the proposed method. Besides, an efficient deployment flow for the mobile CPU is developed, achieving up to 7.46$\\times$ inference acceleration on an octa-core ARM CPU. 
Our codes have been publicly released at https://github.com/RunpeiDong/DGMS.", "bibtex": "@InProceedings{pmlr-v162-dong22a,\n title = \t {Finding the Task-Optimal Low-Bit Sub-Distribution in Deep Neural Networks},\n author = {Dong, Runpei and Tan, Zhanhong and Wu, Mengdi and Zhang, Linfeng and Ma, Kaisheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5343--5359},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dong22a/dong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dong22a.html},\n abstract = \t {Quantized neural networks typically require smaller memory footprints and lower computation complexity, which is crucial for efficient deployment. However, quantization inevitably leads to a distribution divergence from the original network, which generally degrades the performance. To tackle this issue, massive efforts have been made, but most existing approaches lack statistical considerations and depend on several manual configurations. In this paper, we present an adaptive-mapping quantization method to learn an optimal latent sub-distribution that is inherent within models and smoothly approximated with a concrete Gaussian Mixture (GM). In particular, the network weights are projected in compliance with the GM-approximated sub-distribution. This sub-distribution evolves along with the weight update in a co-tuning schema guided by the direct task-objective optimization. Sufficient experiments on image classification and object detection over various modern architectures demonstrate the effectiveness, generalization property, and transferability of the proposed method. Besides, an efficient deployment flow for the mobile CPU is developed, achieving up to 7.46$\\times$ inference acceleration on an octa-core ARM CPU. 
Our codes have been publicly released at https://github.com/RunpeiDong/DGMS.}\n}", "pdf": "https://proceedings.mlr.press/v162/dong22a/dong22a.pdf", "supp": "", "pdf_size": 900286, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7264575101488982108&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Xi'an Jiaotong University; Tsinghua University; Tsinghua University; Tsinghua University; Tsinghua University", "aff_domain": "xjtu.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;mail.tsinghua.edu.cn", "email": "xjtu.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;mail.tsinghua.edu.cn", "github": "https://github.com/RunpeiDong/DGMS", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dong22a.html", "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Xi'an Jiao Tong University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.xjtu.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "XJTU;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Finite-Sum Coupled Compositional Stochastic Optimization: Theory and Applications", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17485", "id": "17485", "proceeding": "https://proceedings.mlr.press/v162/wang22ak.html", "poster": "/media/PosterPDFs/ICML%202022/c54bc2ded4480856dc9f39bdcf35a3e7_WJLRH26.png?t=1657576056.6386273", "slides": "/media/icml-2022/Slides/17485_CNIOw4o.pdf", "author_site": "Bokun Wang, Tianbao Yang", "author": "Bokun Wang; Tianbao Yang", "abstract": "This paper studies stochastic optimization for a sum of compositional functions, where the inner-level function of each summand is coupled with the corresponding summation index. We refer to this family of problems as finite-sum coupled compositional optimization (FCCO). It has broad applications in machine learning for optimizing non-convex or convex compositional measures/objectives such as average precision (AP), p-norm push, listwise ranking losses, neighborhood component analysis (NCA), deep survival analysis, deep latent variable models, etc., which deserves finer analysis. Yet, existing algorithms and analyses are restricted in one or other aspects. The contribution of this paper is to provide a comprehensive convergence analysis of a simple stochastic algorithm for both non-convex and convex objectives. Our key result is the improved oracle complexity with the parallel speed-up by using the moving-average based estimator with mini-batching. Our theoretical analysis also exhibits new insights for improving the practical implementation by sampling the batches of equal size for the outer and inner levels. 
Numerical experiments on AP maximization, NCA, and p-norm push corroborate some aspects of the theory.", "bibtex": "@InProceedings{pmlr-v162-wang22ak,\n title = \t {Finite-Sum Coupled Compositional Stochastic Optimization: Theory and Applications},\n author = {Wang, Bokun and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23292--23317},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ak/wang22ak.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ak.html},\n abstract = \t {This paper studies stochastic optimization for a sum of compositional functions, where the inner-level function of each summand is coupled with the corresponding summation index. We refer to this family of problems as finite-sum coupled compositional optimization (FCCO). It has broad applications in machine learning for optimizing non-convex or convex compositional measures/objectives such as average precision (AP), p-norm push, listwise ranking losses, neighborhood component analysis (NCA), deep survival analysis, deep latent variable models, etc., which deserves finer analysis. Yet, existing algorithms and analyses are restricted in one or other aspects. The contribution of this paper is to provide a comprehensive convergence analysis of a simple stochastic algorithm for both non-convex and convex objectives. Our key result is the improved oracle complexity with the parallel speed-up by using the moving-average based estimator with mini-batching. Our theoretical analysis also exhibits new insights for improving the practical implementation by sampling the batches of equal size for the outer and inner levels. 
Numerical experiments on AP maximization, NCA, and p-norm push corroborate some aspects of the theory.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ak/wang22ak.pdf", "supp": "", "pdf_size": 19483573, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14656999129705224615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, The University of Iowa, IA, USA; Department of Computer Science, The University of Iowa, IA, USA", "aff_domain": "gmail.com;uiowa.edu", "email": "gmail.com;uiowa.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22ak.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Iowa", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uiowa.edu", "aff_unique_abbr": "UIowa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "First-Order Regret in Reinforcement Learning with Linear Function Approximation: A Robust Estimation Approach", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16443", "id": "16443", "proceeding": "https://proceedings.mlr.press/v162/wagenmaker22a.html", "poster": "/media/PosterPDFs/ICML%202022/cfe912f5cb3aa572bd1c9ae2a9b82207.png?t=1657626036.1487095", "slides": "", "author_site": "Andrew Wagenmaker, Yifang Chen, Max Simchowitz, Simon Du, Kevin Jamieson", "author": "Andrew J Wagenmaker; Yifang Chen; Max Simchowitz; Simon Du; Kevin Jamieson", "abstract": "Obtaining first-order regret bounds\u2014regret bounds scaling not as the worst-case but with some measure of the performance of the optimal policy on a given instance\u2014is a core question in sequential decision-making. While such bounds exist in many settings, they have proven elusive in reinforcement learning with large state spaces. In this work we address this gap, and show that it is possible to obtain regret scaling as $\\widetilde{\\mathcal{O}}(\\sqrt{d^3 H^3 \\cdot V_1^\\star \\cdot K} + d^{3.5}H^3\\log K )$ in reinforcement learning with large state spaces, namely the linear MDP setting. Here $V_1^\\star$ is the value of the optimal policy and $K$ is the number of episodes. 
We demonstrate that existing techniques based on least squares estimation are insufficient to obtain this result, and instead develop a novel robust self-normalized concentration bound based on the robust Catoni mean estimator, which may be of independent interest.", "bibtex": "@InProceedings{pmlr-v162-wagenmaker22a,\n title = \t {First-Order Regret in Reinforcement Learning with Linear Function Approximation: A Robust Estimation Approach},\n author = {Wagenmaker, Andrew J and Chen, Yifang and Simchowitz, Max and Du, Simon and Jamieson, Kevin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22384--22429},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wagenmaker22a/wagenmaker22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wagenmaker22a.html},\n abstract = \t {Obtaining first-order regret bounds\u2014regret bounds scaling not as the worst-case but with some measure of the performance of the optimal policy on a given instance\u2014is a core question in sequential decision-making. While such bounds exist in many settings, they have proven elusive in reinforcement learning with large state spaces. In this work we address this gap, and show that it is possible to obtain regret scaling as $\\widetilde{\\mathcal{O}}(\\sqrt{d^3 H^3 \\cdot V_1^\\star \\cdot K} + d^{3.5}H^3\\log K )$ in reinforcement learning with large state spaces, namely the linear MDP setting. Here $V_1^\\star$ is the value of the optimal policy and $K$ is the number of episodes. We demonstrate that existing techniques based on least squares estimation are insufficient to obtain this result, and instead develop a novel robust self-normalized concentration bound based on the robust Catoni mean estimator, which may be of independent interest.}\n}", "pdf": "https://proceedings.mlr.press/v162/wagenmaker22a/wagenmaker22a.pdf", "supp": "", "pdf_size": 645186, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15203287407278710435&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; CSAIL, MIT, Cambridge, MA; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle", "aff_domain": "cs.washington.edu; ; ; ; ", "email": "cs.washington.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wagenmaker22a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Washington;Massachusetts Institute of Technology", "aff_unique_dep": "Paul G. 
Allen School of Computer Science and Engineering;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.washington.edu;https://www.csail.mit.edu", "aff_unique_abbr": "UW;MIT", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Seattle;Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fisher SAM: Information Geometry and Sharpness Aware Minimisation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17841", "id": "17841", "proceeding": "https://proceedings.mlr.press/v162/kim22f.html", "poster": "/media/PosterPDFs/ICML%202022/0dbcf39d413231953d442f2f17f80cd5.png?t=1657629866.5915945", "slides": "", "author_site": "Minyoung Kim, Da Li, Xu Hu, Timothy Hospedales", "author": "Minyoung Kim; Da Li; Shell X Hu; Timothy Hospedales", "abstract": "Recent sharpness-aware minimisation (SAM) is known to find flat minima which is beneficial for better generalisation with improved robustness. SAM essentially modifies the loss function by the maximum loss value within the small neighborhood around the current iterate. However, it uses the Euclidean ball to define the neighborhood, which can be less accurate since loss functions for neural networks are typically defined over probability distributions (e.g., class predictive probabilities), rendering the parameter space no more Euclidean. In this paper we consider the information geometry of the model parameter space when defining the neighborhood, namely replacing SAM\u2019s Euclidean balls with ellipsoids induced by the Fisher information. Our approach, dubbed Fisher SAM, defines more accurate neighborhood structures that conform to the intrinsic metric of the underlying statistical manifold. For instance, SAM may probe the worst-case loss value at either a too nearby or inappropriately distant point due to the ignorance of the parameter space geometry, which is avoided by our Fisher SAM. Another recent Adaptive SAM approach that stretches/shrinks the Euclidean ball in accordance with the scales of the parameter magnitudes, might be dangerous, potentially destroying the neighborhood structure even severely. We demonstrate the improved performance of the proposed Fisher SAM on several benchmark datasets/tasks.", "bibtex": "@InProceedings{pmlr-v162-kim22f,\n title = \t {{F}isher {SAM}: Information Geometry and Sharpness Aware Minimisation},\n author = {Kim, Minyoung and Li, Da and Hu, Shell X and Hospedales, Timothy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11148--11161},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22f/kim22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22f.html},\n abstract = \t {Recent sharpness-aware minimisation (SAM) is known to find flat minima which is beneficial for better generalisation with improved robustness. SAM essentially modifies the loss function by the maximum loss value within the small neighborhood around the current iterate. 
However, it uses the Euclidean ball to define the neighborhood, which can be less accurate since loss functions for neural networks are typically defined over probability distributions (e.g., class predictive probabilities), rendering the parameter space no more Euclidean. In this paper we consider the information geometry of the model parameter space when defining the neighborhood, namely replacing SAM\u2019s Euclidean balls with ellipsoids induced by the Fisher information. Our approach, dubbed Fisher SAM, defines more accurate neighborhood structures that conform to the intrinsic metric of the underlying statistical manifold. For instance, SAM may probe the worst-case loss value at either a too nearby or inappropriately distant point due to the ignorance of the parameter space geometry, which is avoided by our Fisher SAM. Another recent Adaptive SAM approach that stretches/shrinks the Euclidean ball in accordance with the scales of the parameter magnitudes, might be dangerous, potentially destroying the neighborhood structure even severely. We demonstrate the improved performance of the proposed Fisher SAM on several benchmark datasets/tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22f/kim22f.pdf", "supp": "", "pdf_size": 754860, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2802835746667751278&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Samsung AI Center, Cambridge, UK; Samsung AI Center, Cambridge, UK; Samsung AI Center, Cambridge, UK; Samsung AI Center, Cambridge, UK + University of Edinburgh", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kim22f.html", "aff_unique_index": "0;0;0;0+1", "aff_unique_norm": "Samsung;University of Edinburgh", "aff_unique_dep": "AI Center;", "aff_unique_url": "https://www.samsung.com/global/research-innovation/ai-research/;https://www.ed.ac.uk", "aff_unique_abbr": "SAC;Edinburgh", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Fishing for User Data in Large-Batch Federated Learning via Gradient Magnification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16787", "id": "16787", "proceeding": "https://proceedings.mlr.press/v162/wen22a.html", "poster": "/media/PosterPDFs/ICML%202022/99c83c904d0d64fbef50d919a5c66a80.png?t=1657939129.1920433", "slides": "", "author_site": "Yuxin Wen, Jonas Geiping, Liam Fowl, Micah Goldblum, Tom Goldstein", "author": "Yuxin Wen; Jonas A. Geiping; Liam Fowl; Micah Goldblum; Tom Goldstein", "abstract": "Federated learning (FL) has rapidly risen in popularity due to its promise of privacy and efficiency. Previous works have exposed privacy vulnerabilities in the FL pipeline by recovering user data from gradient updates. However, existing attacks fail to address realistic settings because they either 1) require toy settings with very small batch sizes, or 2) require unrealistic and conspicuous architecture modifications. We introduce a new strategy that dramatically elevates existing attacks to operate on batches of arbitrarily large size, and without architectural modifications. Our model-agnostic strategy only requires modifications to the model parameters sent to the user, which is a realistic threat model in many scenarios. 
We demonstrate the strategy in challenging large-scale settings, obtaining high-fidelity data extraction in both cross-device and cross-silo federated learning. Code is available at https://github.com/JonasGeiping/breaching.", "bibtex": "@InProceedings{pmlr-v162-wen22a,\n title = \t {Fishing for User Data in Large-Batch Federated Learning via Gradient Magnification},\n author = {Wen, Yuxin and Geiping, Jonas A. and Fowl, Liam and Goldblum, Micah and Goldstein, Tom},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23668--23684},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wen22a/wen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wen22a.html},\n abstract = \t {Federated learning (FL) has rapidly risen in popularity due to its promise of privacy and efficiency. Previous works have exposed privacy vulnerabilities in the FL pipeline by recovering user data from gradient updates. However, existing attacks fail to address realistic settings because they either 1) require toy settings with very small batch sizes, or 2) require unrealistic and conspicuous architecture modifications. We introduce a new strategy that dramatically elevates existing attacks to operate on batches of arbitrarily large size, and without architectural modifications. Our model-agnostic strategy only requires modifications to the model parameters sent to the user, which is a realistic threat model in many scenarios. We demonstrate the strategy in challenging large-scale settings, obtaining high-fidelity data extraction in both cross-device and cross-silo federated learning. 
Code is available at https://github.com/JonasGeiping/breaching.}\n}", "pdf": "https://proceedings.mlr.press/v162/wen22a/wen22a.pdf", "supp": "", "pdf_size": 15651361, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11388041584211331417&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Maryland; University of Maryland; University of Maryland; New York University; University of Maryland", "aff_domain": "umd.edu;umd.edu;umd.edu; ;umd.edu", "email": "umd.edu;umd.edu;umd.edu; ;umd.edu", "github": "https://github.com/JonasGeiping/breaching", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wen22a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Maryland;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fishr: Invariant Gradient Variances for Out-of-Distribution Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17213", "id": "17213", "proceeding": "https://proceedings.mlr.press/v162/rame22a.html", "poster": "/media/PosterPDFs/ICML%202022/8d317bdcf4aafcfc22149d77babee96d_codUyxT.png?t=1656576081.494801", "slides": "/media/icml-2022/Slides/17213.pdf", "author_site": "Alexandre Rame, Corentin Dancette, Matthieu Cord", "author": "Alexandre Rame; Corentin Dancette; Matthieu Cord", "abstract": "Learning robust models that generalize well under changes in the data distribution is critical for real-world applications. To this end, there has been a growing surge of interest to learn simultaneously from multiple training domains - while enforcing different types of invariance across those domains. Yet, all existing approaches fail to show systematic benefits under controlled evaluation protocols. In this paper, we introduce a new regularization - named Fishr - that enforces domain invariance in the space of the gradients of the loss: specifically, the domain-level variances of gradients are matched across training domains. Our approach is based on the close relations between the gradient covariance, the Fisher Information and the Hessian of the loss: in particular, we show that Fishr eventually aligns the domain-level loss landscapes locally around the final weights. Extensive experiments demonstrate the effectiveness of Fishr for out-of-distribution generalization. Notably, Fishr improves the state of the art on the DomainBed benchmark and performs consistently better than Empirical Risk Minimization. 
Our code is available at https://github.com/alexrame/fishr.", "bibtex": "@InProceedings{pmlr-v162-rame22a,\n title = \t {Fishr: Invariant Gradient Variances for Out-of-Distribution Generalization},\n author = {Rame, Alexandre and Dancette, Corentin and Cord, Matthieu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18347--18377},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rame22a/rame22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rame22a.html},\n abstract = \t {Learning robust models that generalize well under changes in the data distribution is critical for real-world applications. To this end, there has been a growing surge of interest to learn simultaneously from multiple training domains - while enforcing different types of invariance across those domains. Yet, all existing approaches fail to show systematic benefits under controlled evaluation protocols. In this paper, we introduce a new regularization - named Fishr - that enforces domain invariance in the space of the gradients of the loss: specifically, the domain-level variances of gradients are matched across training domains. Our approach is based on the close relations between the gradient covariance, the Fisher Information and the Hessian of the loss: in particular, we show that Fishr eventually aligns the domain-level loss landscapes locally around the final weights. Extensive experiments demonstrate the effectiveness of Fishr for out-of-distribution generalization. Notably, Fishr improves the state of the art on the DomainBed benchmark and performs consistently better than Empirical Risk Minimization. 
Our code is available at https://github.com/alexrame/fishr.}\n}", "pdf": "https://proceedings.mlr.press/v162/rame22a/rame22a.pdf", "supp": "", "pdf_size": 2384846, "gs_citation": 267, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12989883752146186165&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France; Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France; Sorbonne Universit\u00e9, CNRS, LIP6, Paris, France + Valeo.ai", "aff_domain": "sorbonne-universite.fr; ; ", "email": "sorbonne-universite.fr; ; ", "github": "https://github.com/alexrame/fishr", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/rame22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Sorbonne Universit\u00e9;Valeo", "aff_unique_dep": "LIP6;Valeo.ai", "aff_unique_url": "https://www.sorbonne-universite.fr;https://www.valeo.com", "aff_unique_abbr": "SU;Valeo", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "France" }, { "title": "Flashlight: Enabling Innovation in Tools for Machine Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17877", "id": "17877", "proceeding": "https://proceedings.mlr.press/v162/kahn22a.html", "poster": "/media/PosterPDFs/ICML%202022/976abf49974d4686f87192efa0513ae0_cV7Pdsl.png?t=1658264467.568007", "slides": "/media/icml-2022/Slides/17877_aadAdcP.pdf", "author_site": "Jacob Kahn, Vineel Pratap, Tatiana Likhomanenko, Qiantong Xu, Awni Hannun, Jeff Cai, Paden Tomasello, Ann Lee, Edouard Grave, Gilad Avidov, Benoit Steiner, Vitaliy Liptchinsky, Gabriel Synnaeve, Ronan Collobert", "author": "Jacob D Kahn; Vineel Pratap; Tatiana Likhomanenko; Qiantong Xu; Awni Hannun; Jeff Cai; Paden Tomasello; Ann Lee; Edouard Grave; Gilad Avidov; Benoit Steiner; Vitaliy Liptchinsky; Gabriel Synnaeve; Ronan Collobert", "abstract": "As the computational requirements for machine learning systems and the size and complexity of machine learning frameworks increases, essential framework innovation has become challenging. While computational needs have driven recent compiler, networking, and hardware advancements, utilization of those advancements by machine learning tools is occurring at a slower pace. This is in part due to the difficulties involved in prototyping new computational paradigms with existing frameworks. Large frameworks prioritize machine learning researchers and practitioners as end users and pay comparatively little attention to systems researchers who can push frameworks forward \u2014 we argue that both are equally important stakeholders. We introduce Flashlight, an open-source library built to spur innovation in machine learning tools and systems by prioritizing open, modular, customizable internals and state-of-the-art, research-ready models and training setups across a variety of domains. Flashlight allows systems researchers to rapidly prototype and experiment with novel ideas in machine learning computation and has low overhead, competing with and often outperforming other popular machine learning frameworks. 
We see Flashlight as a tool enabling research that can benefit widely used libraries downstream and bring machine learning and systems researchers closer together.", "bibtex": "@InProceedings{pmlr-v162-kahn22a,\n title = \t {Flashlight: Enabling Innovation in Tools for Machine Learning},\n author = {Kahn, Jacob D and Pratap, Vineel and Likhomanenko, Tatiana and Xu, Qiantong and Hannun, Awni and Cai, Jeff and Tomasello, Paden and Lee, Ann and Grave, Edouard and Avidov, Gilad and Steiner, Benoit and Liptchinsky, Vitaliy and Synnaeve, Gabriel and Collobert, Ronan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10557--10574},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kahn22a/kahn22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kahn22a.html},\n abstract = \t {As the computational requirements for machine learning systems and the size and complexity of machine learning frameworks increases, essential framework innovation has become challenging. While computational needs have driven recent compiler, networking, and hardware advancements, utilization of those advancements by machine learning tools is occurring at a slower pace. This is in part due to the difficulties involved in prototyping new computational paradigms with existing frameworks. Large frameworks prioritize machine learning researchers and practitioners as end users and pay comparatively little attention to systems researchers who can push frameworks forward \u2014 we argue that both are equally important stakeholders. We introduce Flashlight, an open-source library built to spur innovation in machine learning tools and systems by prioritizing open, modular, customizable internals and state-of-the-art, research-ready models and training setups across a variety of domains. Flashlight allows systems researchers to rapidly prototype and experiment with novel ideas in machine learning computation and has low overhead, competing with and often outperforming other popular machine learning frameworks. 
We see Flashlight as a tool enabling research that can benefit widely used libraries downstream and bring machine learning and systems researchers closer together.}\n}", "pdf": "https://proceedings.mlr.press/v162/kahn22a/kahn22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kahn22a-supp.zip", "pdf_size": 276546, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13806487547053815832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Facebook AI Research, Menlo Park, CA, U.S.A.+Currently at Apple, Cupertino, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.+Currently at Apple, Cupertino, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Zoom AI, San Jose, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Paris, France; Facebook, Menlo Park, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Menlo Park, CA, U.S.A.; Facebook AI Research, Paris, France; Facebook AI Research, Menlo Park, CA, U.S.A.+Currently at Apple, Cupertino, CA, U.S.A.", "aff_domain": "fb.com; ; ; ; ; ; ; ; ; ; ; ; ; ", "email": "fb.com; ; ; ; ; ; ; ; ; ; ; ; ; ", "github": "", "project": "Flashlight is available at this URL.", "author_num": 14, "oa": "https://proceedings.mlr.press/v162/kahn22a.html", "aff_unique_index": "0+1;0;0+1;0;2;0;0;0;0;0;0;0;0;0+1", "aff_unique_norm": "Meta;Apple;Zoom AI", "aff_unique_dep": "Facebook AI Research;Apple Inc.;AI Division", "aff_unique_url": "https://research.facebook.com;https://www.apple.com;https://zoom.ai", "aff_unique_abbr": "FAIR;Apple;Zoom AI", "aff_campus_unique_index": "0+1;0;0+1;0;2;0;0;0;3;0;0;0;3;0+1", "aff_campus_unique": "Menlo Park;Cupertino;San Jose;Paris", "aff_country_unique_index": "0+0;0;0+0;0;0;0;0;0;1;0;0;0;1;0+0", "aff_country_unique": "United States;France" }, { "title": "Flow-Guided Sparse Transformer for Video Deblurring", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16807", "id": "16807", "proceeding": "https://proceedings.mlr.press/v162/lin22a.html", "poster": "/media/PosterPDFs/ICML%202022/5878a7ab84fb43402106c575658472fa_mVTYGls.png?t=1657246073.3179455", "slides": "/media/icml-2022/Slides/16807.pdf", "author_site": "Jing Lin, Yuanhao Cai, Xiaowan Hu, Haoqian Wang, Youliang Yan, Xueyi Zou, Henghui Ding, Yulun Zhang, Radu Timofte, Luc Van Gool", "author": "Jing Lin; Yuanhao Cai; Xiaowan Hu; Haoqian Wang; Youliang Yan; Xueyi Zou; Henghui Ding; Yulun Zhang; Radu Timofte; Luc Van Gool", "abstract": "Exploiting similar and sharper scene patches in spatio-temporal neighborhoods is critical for video deblurring. However, CNN-based methods show limitations in capturing long-range dependencies and modeling non-local self-similarity. In this paper, we propose a novel framework, Flow-Guided Sparse Transformer (FGST), for video deblurring. In FGST, we customize a self-attention module, Flow-Guided Sparse Window-based Multi-head Self-Attention (FGSW-MSA). For each $query$ element on the blurry reference frame, FGSW-MSA enjoys the guidance of the estimated optical flow to globally sample spatially sparse yet highly related $key$ elements corresponding to the same scene patch in neighboring frames. Besides, we present a Recurrent Embedding (RE) mechanism to transfer information from past frames and strengthen long-range temporal dependencies. 
Comprehensive experiments demonstrate that our proposed FGST outperforms state-of-the-art (SOTA) methods on both DVD and GOPRO datasets and yields visually pleasant results in real video deblurring. https://github.com/linjing7/VR-Baseline", "bibtex": "@InProceedings{pmlr-v162-lin22a,\n title = \t {Flow-Guided Sparse Transformer for Video Deblurring},\n author = {Lin, Jing and Cai, Yuanhao and Hu, Xiaowan and Wang, Haoqian and Yan, Youliang and Zou, Xueyi and Ding, Henghui and Zhang, Yulun and Timofte, Radu and Van Gool, Luc},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13334--13343},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22a/lin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22a.html},\n abstract = \t {Exploiting similar and sharper scene patches in spatio-temporal neighborhoods is critical for video deblurring. However, CNN-based methods show limitations in capturing long-range dependencies and modeling non-local self-similarity. In this paper, we propose a novel framework, Flow-Guided Sparse Transformer (FGST), for video deblurring. In FGST, we customize a self-attention module, Flow-Guided Sparse Window-based Multi-head Self-Attention (FGSW-MSA). For each $query$ element on the blurry reference frame, FGSW-MSA enjoys the guidance of the estimated optical flow to globally sample spatially sparse yet highly related $key$ elements corresponding to the same scene patch in neighboring frames. Besides, we present a Recurrent Embedding (RE) mechanism to transfer information from past frames and strengthen long-range temporal dependencies. Comprehensive experiments demonstrate that our proposed FGST outperforms state-of-the-art (SOTA) methods on both DVD and GOPRO datasets and yields visually pleasant results in real video deblurring. 
https://github.com/linjing7/VR-Baseline}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22a/lin22a.pdf", "supp": "", "pdf_size": 13129473, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14219657862279161517&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Shenzhen International Graduate School, Tsinghua University+ETH Z\u00fcrich; Shenzhen International Graduate School, Tsinghua University+ETH Z\u00fcrich; Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich; ETH Z\u00fcrich", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;sz.tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;huawei.com;ethz.ch;ethz.ch;inf.ethz.ch;vision.ee.ethz.ch", "email": "tsinghua.edu.cn;tsinghua.edu.cn;sz.tsinghua.edu.cn;tsinghua.edu.cn;huawei.com;huawei.com;ethz.ch;ethz.ch;inf.ethz.ch;vision.ee.ethz.ch", "github": "https://github.com/linjing7/VR-Baseline", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/lin22a.html", "aff_unique_index": "0+1;0+1;0;0;2;2;1;1;1;1", "aff_unique_norm": "Tsinghua University;ETH Zurich;Huawei", "aff_unique_dep": "Shenzhen International Graduate School;;Noah\u2019s Ark Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ethz.ch;https://www.huawei.com", "aff_unique_abbr": "THU;ETHZ;Huawei", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0+1;0+1;0;0;0;0;1;1;1;1", "aff_country_unique": "China;Switzerland" }, { "title": "Flow-based Recurrent Belief State Learning for POMDPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17125", "id": "17125", "proceeding": "https://proceedings.mlr.press/v162/chen22q.html", "poster": "/media/PosterPDFs/ICML%202022/c9d9edbf9b9e23eb5d4819bbcce9b078.png?t=1657542538.292317", "slides": "", "author_site": "Xiaoyu Chen, Yao Mu, Ping Luo, Shengbo Li, Jianyu Chen", "author": "Xiaoyu Chen; Yao Mark Mu; Ping Luo; Shengbo Li; Jianyu Chen", "abstract": "Partially Observable Markov Decision Process (POMDP) provides a principled and generic framework to model real world sequential decision making processes but yet remains unsolved, especially for high dimensional continuous space and unknown models. The main challenge lies in how to accurately obtain the belief state, which is the probability distribution over the unobservable environment states given historical information. Accurately calculating this belief state is a precondition for obtaining an optimal policy of POMDPs. Recent advances in deep learning techniques show great potential to learn good belief states. However, existing methods can only learn approximated distribution with limited flexibility. In this paper, we introduce the \\textbf{F}l\\textbf{O}w-based \\textbf{R}ecurrent \\textbf{BE}lief \\textbf{S}tate model (FORBES), which incorporates normalizing flows into the variational inference to learn general continuous belief states for POMDPs. Furthermore, we show that the learned belief states can be plugged into downstream RL algorithms to improve performance. 
In experiments, we show that our methods successfully capture the complex belief states that enable multi-modal predictions as well as high quality reconstructions, and results on challenging visual-motor control tasks show that our method achieves superior performance and sample efficiency.", "bibtex": "@InProceedings{pmlr-v162-chen22q,\n title = \t {Flow-based Recurrent Belief State Learning for {POMDP}s},\n author = {Chen, Xiaoyu and Mu, Yao Mark and Luo, Ping and Li, Shengbo and Chen, Jianyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3444--3468},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22q/chen22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22q.html},\n abstract = \t {Partially Observable Markov Decision Process (POMDP) provides a principled and generic framework to model real world sequential decision making processes but yet remains unsolved, especially for high dimensional continuous space and unknown models. The main challenge lies in how to accurately obtain the belief state, which is the probability distribution over the unobservable environment states given historical information. Accurately calculating this belief state is a precondition for obtaining an optimal policy of POMDPs. Recent advances in deep learning techniques show great potential to learn good belief states. However, existing methods can only learn approximated distribution with limited flexibility. In this paper, we introduce the \\textbf{F}l\\textbf{O}w-based \\textbf{R}ecurrent \\textbf{BE}lief \\textbf{S}tate model (FORBES), which incorporates normalizing flows into the variational inference to learn general continuous belief states for POMDPs. Furthermore, we show that the learned belief states can be plugged into downstream RL algorithms to improve performance. In experiments, we show that our methods successfully capture the complex belief states that enable multi-modal predictions as well as high quality reconstructions, and results on challenging visual-motor control tasks show that our method achieves superior performance and sample efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22q/chen22q.pdf", "supp": "", "pdf_size": 6652538, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5528357059377876910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22q.html" }, { "title": "Flowformer: Linearizing Transformers with Conservation Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17257", "id": "17257", "proceeding": "https://proceedings.mlr.press/v162/wu22m.html", "poster": "/media/PosterPDFs/ICML%202022/01882513d5fa7c329e940dda99b12147.png?t=1657439815.4022634", "slides": "", "author_site": "Haixu Wu, Jialong Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long", "author": "Haixu Wu; Jialong Wu; Jiehui Xu; Jianmin Wang; Mingsheng Long", "abstract": "Transformers based on the attention mechanism have achieved impressive success in various areas. 
However, the attention mechanism has a quadratic complexity, significantly impeding Transformers from dealing with numerous tokens and scaling up to bigger models. Previous methods mainly utilize the similarity decomposition and the associativity of matrix multiplication to devise linear-time attention mechanisms. They avoid degeneration of attention to a trivial distribution by reintroducing inductive biases such as the locality, thereby at the expense of model generality and expressiveness. In this paper, we linearize Transformers free from specific inductive biases based on the flow network theory. We cast attention as the information flow aggregated from the sources (values) to the sinks (results) through the learned flow capacities (attentions). Within this framework, we apply the property of flow conservation into attention and propose the Flow-Attention mechanism of linear complexity. By respectively conserving the incoming flow of sinks for source competition and the outgoing flow of sources for sink allocation, Flow-Attention inherently generates informative attentions without using specific inductive biases. Empowered by the Flow-Attention, Flowformer yields strong performance in linear time for wide areas, including long sequence, time series, vision, natural language, and reinforcement learning. The code and settings are available at this repository: https://github.com/thuml/Flowformer.", "bibtex": "@InProceedings{pmlr-v162-wu22m,\n title = \t {Flowformer: Linearizing Transformers with Conservation Flows},\n author = {Wu, Haixu and Wu, Jialong and Xu, Jiehui and Wang, Jianmin and Long, Mingsheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24226--24242},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22m/wu22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22m.html},\n abstract = \t {Transformers based on the attention mechanism have achieved impressive success in various areas. However, the attention mechanism has a quadratic complexity, significantly impeding Transformers from dealing with numerous tokens and scaling up to bigger models. Previous methods mainly utilize the similarity decomposition and the associativity of matrix multiplication to devise linear-time attention mechanisms. They avoid degeneration of attention to a trivial distribution by reintroducing inductive biases such as the locality, thereby at the expense of model generality and expressiveness. In this paper, we linearize Transformers free from specific inductive biases based on the flow network theory. We cast attention as the information flow aggregated from the sources (values) to the sinks (results) through the learned flow capacities (attentions). Within this framework, we apply the property of flow conservation into attention and propose the Flow-Attention mechanism of linear complexity. By respectively conserving the incoming flow of sinks for source competition and the outgoing flow of sources for sink allocation, Flow-Attention inherently generates informative attentions without using specific inductive biases. 
Empowered by the Flow-Attention, Flowformer yields strong performance in linear time for wide areas, including long sequence, time series, vision, natural language, and reinforcement learning. The code and settings are available at this repository: https://github.com/thuml/Flowformer.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22m/wu22m.pdf", "supp": "", "pdf_size": 8474580, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13534095276250575794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Software, BNRist, Tsinghua University; School of Software, BNRist, Tsinghua University; School of Software, BNRist, Tsinghua University; School of Software, BNRist, Tsinghua University; School of Software, BNRist, Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn; ; ; ;tsinghua.edu.cn", "email": "mails.tsinghua.edu.cn; ; ; ;tsinghua.edu.cn", "github": "https://github.com/thuml/Flowformer", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wu22m.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "School of Software", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fluctuations, Bias, Variance & Ensemble of Learners: Exact Asymptotics for Convex Losses in High-Dimension", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16331", "id": "16331", "proceeding": "https://proceedings.mlr.press/v162/loureiro22a.html", "poster": "/media/PosterPDFs/ICML%202022/e8bf0f27d70d480d3ab793bb7619aaa5.png?t=1656583293.5359895", "slides": "/media/icml-2022/Slides/16331_Bvn1VqK.pdf", "author_site": "Bruno Loureiro, Cedric Gerbelot, Maria Refinetti, Gabriele Sicuro, FLORENT KRZAKALA", "author": "Bruno Loureiro; Cedric Gerbelot; Maria Refinetti; Gabriele Sicuro; Florent Krzakala", "abstract": "From the sampling of data to the initialisation of parameters, randomness is ubiquitous in modern Machine Learning practice. Understanding the statistical fluctuations engendered by the different sources of randomness in prediction is therefore key to understanding robust generalisation. In this manuscript we develop a quantitative and rigorous theory for the study of fluctuations in an ensemble of generalised linear models trained on different, but correlated, features in high-dimensions. In particular, we provide a complete description of the asymptotic joint distribution of the empirical risk minimiser for generic convex loss and regularisation in the high-dimensional limit. Our result encompasses a rich set of classification and regression tasks, such as the lazy regime of overparametrised neural networks, or equivalently the random features approximation of kernels. 
While allowing to study directly the mitigating effect of ensembling (or bagging) on the bias-variance decomposition of the test error, our analysis also helps disentangle the contribution of statistical fluctuations, and the singular role played by the interpolation threshold that are at the roots of the \u201cdouble-descent\u201d phenomenon.", "bibtex": "@InProceedings{pmlr-v162-loureiro22a,\n title = \t {Fluctuations, Bias, Variance & Ensemble of Learners: Exact Asymptotics for Convex Losses in High-Dimension},\n author = {Loureiro, Bruno and Gerbelot, Cedric and Refinetti, Maria and Sicuro, Gabriele and Krzakala, Florent},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14283--14314},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/loureiro22a/loureiro22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/loureiro22a.html},\n abstract = \t {From the sampling of data to the initialisation of parameters, randomness is ubiquitous in modern Machine Learning practice. Understanding the statistical fluctuations engendered by the different sources of randomness in prediction is therefore key to understanding robust generalisation. In this manuscript we develop a quantitative and rigorous theory for the study of fluctuations in an ensemble of generalised linear models trained on different, but correlated, features in high-dimensions. In particular, we provide a complete description of the asymptotic joint distribution of the empirical risk minimiser for generic convex loss and regularisation in the high-dimensional limit. Our result encompasses a rich set of classification and regression tasks, such as the lazy regime of overparametrised neural networks, or equivalently the random features approximation of kernels. 
While allowing to study directly the mitigating effect of ensembling (or bagging) on the bias-variance decomposition of the test error, our analysis also helps disentangle the contribution of statistical fluctuations, and the singular role played by the interpolation threshold that are at the roots of the \u201cdouble-descent\u201d phenomenon.}\n}", "pdf": "https://proceedings.mlr.press/v162/loureiro22a/loureiro22a.pdf", "supp": "", "pdf_size": 5718281, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12733418833847323328&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/loureiro22a.html" }, { "title": "For Learning in Symmetric Teams, Local Optima are Global Nash Equilibria", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15977", "id": "15977", "proceeding": "https://proceedings.mlr.press/v162/emmons22a.html", "poster": "/media/PosterPDFs/ICML%202022/0b1ec366924b26fc98fa7b71a9c249cf.png?t=1658331517.3179421", "slides": "", "author_site": "Scott Emmons, Caspar Oesterheld, Andrew Critch, Vincent Conitzer, Stuart Russell", "author": "Scott Emmons; Caspar Oesterheld; Andrew Critch; Vincent Conitzer; Stuart Russell", "abstract": "Although it has been known since the 1970s that a", "bibtex": "@InProceedings{pmlr-v162-emmons22a,\n title = \t {For Learning in Symmetric Teams, Local Optima are Global {N}ash Equilibria},\n author = {Emmons, Scott and Oesterheld, Caspar and Critch, Andrew and Conitzer, Vincent and Russell, Stuart},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5924--5943},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/emmons22a/emmons22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/emmons22a.html},\n abstract = \t {Although it has been known since the 1970s that a", "pdf": "https://proceedings.mlr.press/v162/emmons22a/emmons22a.pdf", "supp": "", "pdf_size": 562988, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16109782432543935692&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of California, Berkeley; Carnegie Mellon University; University of California, Berkeley; Duke University, University of Oxford; University of California, Berkeley", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/emmons22a.html", "aff_unique_index": "0;1;0;2;0", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Duke University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://www.duke.edu", "aff_unique_abbr": "UC Berkeley;CMU;Duke", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Forget-free Continual Learning with Winning Subnetworks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17873", "id": "17873", "proceeding": 
"https://proceedings.mlr.press/v162/kang22b.html", "poster": "/media/PosterPDFs/ICML%202022/c30fb4dc55d801fc7473840b5b161dfa_CUc9hVy.png?t=1658043709.3783274", "slides": "", "author_site": "Haeyong Kang, Rusty Mina, Sultan Rizky Hikmawan Madjid, Jaehong Yoon, Mark Hasegawa-Johnson, Sung Ju Hwang, Chang Yoo", "author": "Haeyong Kang; Rusty John Lloyd Mina; Sultan Rizky Hikmawan Madjid; Jaehong Yoon; Mark Hasegawa-Johnson; Sung Ju Hwang; Chang D. Yoo", "abstract": "Inspired by Lottery Ticket Hypothesis that competitive subnetworks exist within a dense network, we propose a continual learning method referred to as Winning SubNetworks (WSN), which sequentially learns and selects an optimal subnetwork for each task. Specifically, WSN jointly learns the model weights and task-adaptive binary masks pertaining to subnetworks associated with each task whilst attempting to select a small set of weights to be activated (winning ticket) by reusing weights of the prior subnetworks. The proposed method is inherently immune to catastrophic forgetting as each selected subnetwork model does not infringe upon other subnetworks. Binary masks spawned per winning ticket are encoded into one N-bit binary digit mask, then compressed using Huffman coding for a sub-linear increase in network capacity with respect to the number of tasks.", "bibtex": "@InProceedings{pmlr-v162-kang22b,\n title = \t {Forget-free Continual Learning with Winning Subnetworks},\n author = {Kang, Haeyong and Mina, Rusty John Lloyd and Madjid, Sultan Rizky Hikmawan and Yoon, Jaehong and Hasegawa-Johnson, Mark and Hwang, Sung Ju and Yoo, Chang D.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10734--10750},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kang22b/kang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/kang22b.html},\n abstract = \t {Inspired by Lottery Ticket Hypothesis that competitive subnetworks exist within a dense network, we propose a continual learning method referred to as Winning SubNetworks (WSN), which sequentially learns and selects an optimal subnetwork for each task. Specifically, WSN jointly learns the model weights and task-adaptive binary masks pertaining to subnetworks associated with each task whilst attempting to select a small set of weights to be activated (winning ticket) by reusing weights of the prior subnetworks. The proposed method is inherently immune to catastrophic forgetting as each selected subnetwork model does not infringe upon other subnetworks. 
Binary masks spawned per winning ticket are encoded into one N-bit binary digit mask, then compressed using Huffman coding for a sub-linear increase in network capacity with respect to the number of tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/kang22b/kang22b.pdf", "supp": "", "pdf_size": 2403121, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12641625179791293746&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea; University of Illinois at Urbana-Champaign, USA; Korea Advanced Institute of Science and Technology (KAIST), South Korea + AITRICS, South Korea; Korea Advanced Institute of Science and Technology (KAIST), South Korea", "aff_domain": "kaist.ac.kr; ; ; ; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ; ; ; ;kaist.ac.kr", "github": "https://github.com/ihaeyong/WSN", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/kang22b.html", "aff_unique_index": "0;0;0;0;1;0+2;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Illinois Urbana-Champaign;AITRICS", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://illinois.edu;", "aff_unique_abbr": "KAIST;UIUC;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;1;0+0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Forward Operator Estimation in Generative Models with Kernel Transfer Operators", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16995", "id": "16995", "proceeding": "https://proceedings.mlr.press/v162/huang22b.html", "poster": "/media/PosterPDFs/ICML%202022/a667f4e7b0c8a3babe331569d3eac6bd.png?t=1657747188.2782056", "slides": "", "author_site": "Zhichun Huang, Rudrasis Chakraborty, Vikas Singh", "author": "Zhichun Huang; Rudrasis Chakraborty; Vikas Singh", "abstract": "Generative models which use explicit density modeling (e.g., variational autoencoders, flow-based generative models) involve finding a mapping from a known distribution, e.g. Gaussian, to the unknown input distribution. This often requires searching over a class of non-linear functions (e.g., representable by a deep neural network). While effective in practice, the associated runtime/memory costs can increase rapidly, usually as a function of the performance desired in an application. We propose a substantially cheaper (and simpler) forward operator estimation strategy based on adapting known results on kernel transfer operators. We show that our formulation enables highly efficient distribution approximation and sampling, and offers surprisingly good empirical performance that compares favorably with powerful baselines, but with significant runtime savings. 
We show that the algorithm also performs well in small sample size settings (in brain imaging).", "bibtex": "@InProceedings{pmlr-v162-huang22b,\n title = \t {Forward Operator Estimation in Generative Models with Kernel Transfer Operators},\n author = {Huang, Zhichun and Chakraborty, Rudrasis and Singh, Vikas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9148--9172},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22b/huang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22b.html},\n abstract = \t {Generative models which use explicit density modeling (e.g., variational autoencoders, flow-based generative models) involve finding a mapping from a known distribution, e.g. Gaussian, to the unknown input distribution. This often requires searching over a class of non-linear functions (e.g., representable by a deep neural network). While effective in practice, the associated runtime/memory costs can increase rapidly, usually as a function of the performance desired in an application. We propose a substantially cheaper (and simpler) forward operator estimation strategy based on adapting known results on kernel transfer operators. We show that our formulation enables highly efficient distribution approximation and sampling, and offers surprisingly good empirical performance that compares favorably with powerful baselines, but with significant runtime savings. We show that the algorithm also performs well in small sample size settings (in brain imaging).}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22b/huang22b.pdf", "supp": "", "pdf_size": 23177365, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8492120801119421306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Carnegie Mellon University, Pittsburgh PA, USA; Butlr Inc., Burlingame CA, USA; University of Wisconsin-Madison, Madison WI, USA", "aff_domain": "cs.cmu.edu; ; ", "email": "cs.cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/huang22b.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Carnegie Mellon University;Butlr Inc.;University of Wisconsin-Madison", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;;https://www.wisc.edu", "aff_unique_abbr": "CMU;;UW-Madison", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Pittsburgh;;Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fourier Learning with Cyclical Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16603", "id": "16603", "proceeding": "https://proceedings.mlr.press/v162/yang22o.html", "poster": "/media/PosterPDFs/ICML%202022/c45147dee729311ef5b5c3003946c48f_9VAETWy.png?t=1657751848.6385958", "slides": "", "author_site": "Yingxiang Yang, Zhihan Xiong, Tianyi Liu, Taiqing Wang, Chong Wang", "author": "Yingxiang Yang; Zhihan Xiong; Tianyi Liu; Taiqing Wang; Chong Wang", "abstract": "Many machine learning models for online applications, such as recommender systems, are often trained on data with cyclical properties. These data sequentially arrive from a time-varying distribution that is periodic in time. 
Existing algorithms either use streaming learning to track a time-varying set of optimal model parameters, yielding a dynamic regret that scales linearly in time; or partition the data of each cycle into multiple segments and train a separate model for each\u2014a pluralistic approach that is computationally and storage-wise expensive. In this paper, we have designed a novel approach to overcome the aforementioned shortcomings. Our method, named \"Fourier learning\", encodes the periodicity into the model representation using a partial Fourier sequence, and trains the coefficient functions modeled by neural networks. Particularly, we design a Fourier multi-layer perceptron (F-MLP) that can be trained on streaming data with stochastic gradient descent (streaming-SGD), and we derive its convergence guarantees. We demonstrate Fourier learning\u2019s better performance with extensive experiments on synthetic and public datasets, as well as on a large-scale recommender system that is updated in real-time, and trained with tens of millions of samples per day.", "bibtex": "@InProceedings{pmlr-v162-yang22o,\n title = \t {{F}ourier Learning with Cyclical Data},\n author = {Yang, Yingxiang and Xiong, Zhihan and Liu, Tianyi and Wang, Taiqing and Wang, Chong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25280--25301},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22o/yang22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22o.html},\n abstract = \t {Many machine learning models for online applications, such as recommender systems, are often trained on data with cyclical properties. These data sequentially arrive from a time-varying distribution that is periodic in time. Existing algorithms either use streaming learning to track a time-varying set of optimal model parameters, yielding a dynamic regret that scales linearly in time; or partition the data of each cycle into multiple segments and train a separate model for each\u2014a pluralistic approach that is computationally and storage-wise expensive. In this paper, we have designed a novel approach to overcome the aforementioned shortcomings. Our method, named \"Fourier learning\", encodes the periodicity into the model representation using a partial Fourier sequence, and trains the coefficient functions modeled by neural networks. Particularly, we design a Fourier multi-layer perceptron (F-MLP) that can be trained on streaming data with stochastic gradient descent (streaming-SGD), and we derive its convergence guarantees. We demonstrate Fourier learning\u2019s better performance with extensive experiments on synthetic and public datasets, as well as on a large-scale recommender system that is updated in real-time, and trained with tens of millions of samples per day.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22o/yang22o.pdf", "supp": "", "pdf_size": 954699, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10620642397467556889&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "ByteDance Inc; Paul G. 
Allen School of Computer Science & Engineering, University of Washington, WA; ByteDance Inc; ByteDance Inc; ByteDance Inc", "aff_domain": "bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com", "email": "bytedance.com;bytedance.com;bytedance.com;bytedance.com;bytedance.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yang22o.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "ByteDance;University of Washington", "aff_unique_dep": ";Paul G. Allen School of Computer Science & Engineering", "aff_unique_url": "https://www.bytedance.com;https://www.washington.edu", "aff_unique_abbr": "ByteDance;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Framework for Evaluating Faithfulness of Local Explanations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17445", "id": "17445", "proceeding": "https://proceedings.mlr.press/v162/dasgupta22a.html", "poster": "/media/PosterPDFs/ICML%202022/9701a1c165dd9420816bfec5edd6c2b1.png?t=1657165832.5745761", "slides": "", "author_site": "Sanjoy Dasgupta, Nave Frost, Michal Moshkovitz", "author": "Sanjoy Dasgupta; Nave Frost; Michal Moshkovitz", "abstract": "We study the faithfulness of an explanation system to the underlying prediction model. We show that this can be captured by two properties, consistency and sufficiency, and introduce quantitative measures of the extent to which these hold. Interestingly, these measures depend on the test-time data distribution. For a variety of existing explanation systems, such as anchors, we analytically study these quantities. We also provide estimators and sample complexity bounds for empirically determining the faithfulness of black-box explanation systems. Finally, we experimentally validate the new properties and estimators.", "bibtex": "@InProceedings{pmlr-v162-dasgupta22a,\n title = \t {Framework for Evaluating Faithfulness of Local Explanations},\n author = {Dasgupta, Sanjoy and Frost, Nave and Moshkovitz, Michal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4794--4815},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dasgupta22a/dasgupta22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dasgupta22a.html},\n abstract = \t {We study the faithfulness of an explanation system to the underlying prediction model. We show that this can be captured by two properties, consistency and sufficiency, and introduce quantitative measures of the extent to which these hold. Interestingly, these measures depend on the test-time data distribution. For a variety of existing explanation systems, such as anchors, we analytically study these quantities. We also provide estimators and sample complexity bounds for empirically determining the faithfulness of black-box explanation systems. 
Finally, we experimentally validate the new properties and estimators.}\n}", "pdf": "https://proceedings.mlr.press/v162/dasgupta22a/dasgupta22a.pdf", "supp": "", "pdf_size": 4944586, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4915988109423340381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of California San Diego; Tel-Aviv University; Tel-Aviv University", "aff_domain": "eng.ucsd.edu;mail.tau.ac.il;mail.tau.ac.il", "email": "eng.ucsd.edu;mail.tau.ac.il;mail.tau.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dasgupta22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, San Diego;Tel Aviv University", "aff_unique_dep": ";", "aff_unique_url": "https://ucsd.edu;https://www.tau.ac.il", "aff_unique_abbr": "UCSD;TAU", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "FriendlyCore: Practical Differentially Private Aggregation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16323", "id": "16323", "proceeding": "https://proceedings.mlr.press/v162/tsfadia22a.html", "poster": "/media/PosterPDFs/ICML%202022/59eb5dd36914c29b299c84b7ddaf08ec_ML2Ymgk.png?t=1657452357.6793272", "slides": "", "author_site": "Eliad Tsfadia, Edith Cohen, Haim Kaplan, Yishay Mansour, Uri Stemmer", "author": "Eliad Tsfadia; Edith Cohen; Haim Kaplan; Yishay Mansour; Uri Stemmer", "abstract": "Differentially private algorithms for common metric aggregation tasks, such as clustering or averaging, often have limited practicality due to their complexity or to the large number of data points that is required for accurate results. We propose a simple and practical tool $\\mathsf{FriendlyCore}$ that takes a set of points ${\\cal D}$ from an unrestricted (pseudo) metric space as input. When ${\\cal D}$ has effective diameter $r$, $\\mathsf{FriendlyCore}$ returns a \u201cstable\u201d subset ${\\cal C} \\subseteq {\\cal D}$ that includes all points, except possibly few outliers, and is", "bibtex": "@InProceedings{pmlr-v162-tsfadia22a,\n title = \t {{F}riendly{C}ore: Practical Differentially Private Aggregation},\n author = {Tsfadia, Eliad and Cohen, Edith and Kaplan, Haim and Mansour, Yishay and Stemmer, Uri},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21828--21863},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tsfadia22a/tsfadia22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tsfadia22a.html},\n abstract = \t {Differentially private algorithms for common metric aggregation tasks, such as clustering or averaging, often have limited practicality due to their complexity or to the large number of data points that is required for accurate results. We propose a simple and practical tool $\\mathsf{FriendlyCore}$ that takes a set of points ${\\cal D}$ from an unrestricted (pseudo) metric space as input. 
When ${\\cal D}$ has effective diameter $r$, $\\mathsf{FriendlyCore}$ returns a \u201cstable\u201d subset ${\\cal C} \\subseteq {\\cal D}$ that includes all points, except possibly few outliers, and is", "pdf": "https://proceedings.mlr.press/v162/tsfadia22a/tsfadia22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tsfadia22a-supp.zip", "pdf_size": 1048791, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14192845292027877569&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Research + Blavatnik School of Computer Science, Tel Aviv University; Google Research + Blavatnik School of Computer Science, Tel Aviv University; Blavatnik School of Computer Science, Tel Aviv University; Blavatnik School of Computer Science, Tel Aviv University; Blavatnik School of Computer Science, Tel Aviv University", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/tsfadia22a.html", "aff_unique_index": "0+1;0+1;1;1;1", "aff_unique_norm": "Google;Tel Aviv University", "aff_unique_dep": "Google Research;Blavatnik School of Computer Science", "aff_unique_url": "https://research.google;https://www.tau.ac.il", "aff_unique_abbr": "Google Research;TAU", "aff_campus_unique_index": "0+1;0+1;1;1;1", "aff_campus_unique": "Mountain View;Tel Aviv", "aff_country_unique_index": "0+1;0+1;1;1;1", "aff_country_unique": "United States;Israel" }, { "title": "From Dirichlet to Rubin: Optimistic Exploration in RL without Bonuses", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16389", "id": "16389", "proceeding": "https://proceedings.mlr.press/v162/tiapkin22a.html", "poster": "/media/PosterPDFs/ICML%202022/9715d04413f296eaf3c30c47cec3daa6.png?t=1657793570.388583", "slides": "", "author_site": "Daniil Tiapkin, Denis Belomestny, Eric Moulines, Alexey Naumov, Sergey Samsonov, Yunhao Tang, Michal Valko, Pierre Menard", "author": "Daniil Tiapkin; Denis Belomestny; Eric Moulines; Alexey Naumov; Sergey Samsonov; Yunhao Tang; Michal Valko; Pierre Menard", "abstract": "We propose the Bayes-UCBVI algorithm for reinforcement learning in tabular, stage-dependent, episodic Markov decision process: a natural extension of the Bayes-UCB algorithm by Kaufmann et al. 2012 for multi-armed bandits. Our method uses the quantile of a Q-value function posterior as upper confidence bound on the optimal Q-value function. For Bayes-UCBVI, we prove a regret bound of order $\\widetilde{\\mathcal{O}}(\\sqrt{H^3SAT})$ where $H$ is the length of one episode, $S$ is the number of states, $A$ the number of actions, $T$ the number of episodes, that matches the lower-bound of $\\Omega(\\sqrt{H^3SAT})$ up to poly-$\\log$ terms in $H,S,A,T$ for a large enough $T$. 
To the best of our knowledge, this is the first algorithm that obtains an optimal dependence on the horizon $H$ (and $S$)", "bibtex": "@InProceedings{pmlr-v162-tiapkin22a,\n title = \t {From {D}irichlet to Rubin: Optimistic Exploration in {RL} without Bonuses},\n author = {Tiapkin, Daniil and Belomestny, Denis and Moulines, Eric and Naumov, Alexey and Samsonov, Sergey and Tang, Yunhao and Valko, Michal and Menard, Pierre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21380--21431},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tiapkin22a/tiapkin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tiapkin22a.html},\n abstract = \t {We propose the Bayes-UCBVI algorithm for reinforcement learning in tabular, stage-dependent, episodic Markov decision process: a natural extension of the Bayes-UCB algorithm by Kaufmann et al. 2012 for multi-armed bandits. Our method uses the quantile of a Q-value function posterior as upper confidence bound on the optimal Q-value function. For Bayes-UCBVI, we prove a regret bound of order $\\widetilde{\\mathcal{O}}(\\sqrt{H^3SAT})$ where $H$ is the length of one episode, $S$ is the number of states, $A$ the number of actions, $T$ the number of episodes, that matches the lower-bound of $\\Omega(\\sqrt{H^3SAT})$ up to poly-$\\log$ terms in $H,S,A,T$ for a large enough $T$. To the best of our knowledge, this is the first algorithm that obtains an optimal dependence on the horizon $H$ (and $S$)", "pdf": "https://proceedings.mlr.press/v162/tiapkin22a/tiapkin22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tiapkin22a-supp.zip", "pdf_size": 1102455, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13718848598996156852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "HSE University+Artificial Intelligence Research Institute; Duisburg-Essen University; \u00c9cole Polytechnique; HSE University; HSE University; DeepMind; DeepMind; Otto von Guericke University", "aff_domain": "hse.ru; ; ; ; ; ; ; ", "email": "hse.ru; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/tiapkin22a.html", "aff_unique_index": "0+1;2;3;0;0;4;4;5", "aff_unique_norm": "Higher School of Economics;Artificial Intelligence Research Institute;University of Duisburg-Essen;Ecole Polytechnique;DeepMind;Otto von Guericke University Magdeburg", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://hse.ru;;https://www.uni-due.de;https://www.polytechnique.edu;https://deepmind.com;https://www.ovgu.de", "aff_unique_abbr": "HSE;;UDE;X;DeepMind;OVGU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2;3;0;0;4;4;2", "aff_country_unique": "Russian Federation;United States;Germany;France;United Kingdom" }, { "title": "From Noisy Prediction to True Label: Noisy Prediction Calibration via Generative Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18349", "id": "18349", "proceeding": "https://proceedings.mlr.press/v162/bae22a.html", "poster": "/media/PosterPDFs/ICML%202022/c54e7837e0cd0ced286cb5995327d1ab_Frbwbb7.png?t=1657450533.1361628", "slides": 
"/media/icml-2022/Slides/18349_XuN1nb6.pdf", "author_site": "HeeSun Bae, Seungjae Shin, Byeonghu Na, JoonHo Jang, Kyungwoo Song, IL CHUL MOON", "author": "Heesun Bae; Seungjae Shin; Byeonghu Na; Joonho Jang; Kyungwoo Song; Il-Chul Moon", "abstract": "Noisy labels are inevitable yet problematic in machine learning society. It ruins the generalization of a classifier by making the classifier over-fitted to noisy labels. Existing methods on noisy label have focused on modifying the classifier during the training procedure. It has two potential problems. First, these methods are not applicable to a pre-trained classifier without further access to training. Second, it is not easy to train a classifier and regularize all negative effects from noisy labels, simultaneously. We suggest a new branch of method, Noisy Prediction Calibration (NPC) in learning with noisy labels. Through the introduction and estimation of a new type of transition matrix via generative model, NPC corrects the noisy prediction from the pre-trained classifier to the true label as a post-processing scheme. We prove that NPC theoretically aligns with the transition matrix based methods. Yet, NPC empirically provides more accurate pathway to estimate true label, even without involvement in classifier learning. Also, NPC is applicable to any classifier trained with noisy label methods, if training instances and its predictions are available. Our method, NPC, boosts the classification performances of all baseline models on both synthetic and real-world datasets. The implemented code is available at https://github.com/BaeHeeSun/NPC.", "bibtex": "@InProceedings{pmlr-v162-bae22a,\n title = \t {From Noisy Prediction to True Label: Noisy Prediction Calibration via Generative Model},\n author = {Bae, Heesun and Shin, Seungjae and Na, Byeonghu and Jang, Joonho and Song, Kyungwoo and Moon, Il-Chul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1277--1297},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bae22a/bae22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bae22a.html},\n abstract = \t {Noisy labels are inevitable yet problematic in machine learning society. It ruins the generalization of a classifier by making the classifier over-fitted to noisy labels. Existing methods on noisy label have focused on modifying the classifier during the training procedure. It has two potential problems. First, these methods are not applicable to a pre-trained classifier without further access to training. Second, it is not easy to train a classifier and regularize all negative effects from noisy labels, simultaneously. We suggest a new branch of method, Noisy Prediction Calibration (NPC) in learning with noisy labels. Through the introduction and estimation of a new type of transition matrix via generative model, NPC corrects the noisy prediction from the pre-trained classifier to the true label as a post-processing scheme. We prove that NPC theoretically aligns with the transition matrix based methods. Yet, NPC empirically provides more accurate pathway to estimate true label, even without involvement in classifier learning. 
Also, NPC is applicable to any classifier trained with noisy label methods, if training instances and its predictions are available. Our method, NPC, boosts the classification performances of all baseline models on both synthetic and real-world datasets. The implemented code is available at https://github.com/BaeHeeSun/NPC.}\n}", "pdf": "https://proceedings.mlr.press/v162/bae22a/bae22a.pdf", "supp": "", "pdf_size": 5256658, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8277956937717286777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Department of AI, University of Seoul, Seoul, Republic of Korea; Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea + Summary.AI, Daejeon, Republic of Korea", "aff_domain": "kaist.ac.kr; ; ; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ; ; ;kaist.ac.kr", "github": "https://github.com/BaeHeeSun/NPC", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/bae22a.html", "aff_unique_index": "0;0;0;0;1;0+2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Seoul;summary.ai", "aff_unique_dep": "Industrial and Systems Engineering;Department of AI;", "aff_unique_url": "https://www.kaist.ac.kr;http://www.seoultech.ac.kr;", "aff_unique_abbr": "KAIST;UOS;", "aff_campus_unique_index": "0;0;0;0;1;0+0", "aff_campus_unique": "Daejeon;Seoul", "aff_country_unique_index": "0;0;0;0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "From block-Toeplitz matrices to differential equations on graphs: towards a general theory for scalable masked Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16581", "id": "16581", "proceeding": "https://proceedings.mlr.press/v162/choromanski22a.html", "poster": "/media/PosterPDFs/ICML%202022/f231f2107df69eab0a3862d50018a9b2_mzhGQSV.png?t=1656641467.5088875", "slides": "", "author_site": "Krzysztof Choromanski, Han Lin, Haoxian Chen, Tianyi Zhang, Arijit Sehanobish, Valerii Likhosherstov, Jack Parker-Holder, Tamas Sarlos, Adrian Weller, Thomas Weingarten", "author": "Krzysztof Choromanski; Han Lin; Haoxian Chen; Tianyi Zhang; Arijit Sehanobish; Valerii Likhosherstov; Jack Parker-Holder; Tamas Sarlos; Adrian Weller; Thomas Weingarten", "abstract": "In this paper we provide, to the best of our knowledge, the first comprehensive approach for incorporating various masking mechanisms into Transformers architectures in a scalable way. We show that recent results on linear causal attention (Choromanski et al., 2021) and log-linear RPE-attention (Luo et al., 2021) are special cases of this general mechanism. However by casting the problem as a topological (graph-based) modulation of unmasked attention, we obtain several results unknown before, including efficient d-dimensional RPE-masking and graph-kernel masking. 
We leverage many mathematical techniques ranging from spectral analysis through dynamic programming and random walks to new algorithms for solving Markov processes on graphs. We provide a corresponding empirical evaluation.", "bibtex": "@InProceedings{pmlr-v162-choromanski22a,\n title = \t {From block-Toeplitz matrices to differential equations on graphs: towards a general theory for scalable masked Transformers},\n author = {Choromanski, Krzysztof and Lin, Han and Chen, Haoxian and Zhang, Tianyi and Sehanobish, Arijit and Likhosherstov, Valerii and Parker-Holder, Jack and Sarlos, Tamas and Weller, Adrian and Weingarten, Thomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3962--3983},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/choromanski22a/choromanski22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/choromanski22a.html},\n abstract = \t {In this paper we provide, to the best of our knowledge, the first comprehensive approach for incorporating various masking mechanisms into Transformers architectures in a scalable way. We show that recent results on linear causal attention (Choromanski et al., 2021) and log-linear RPE-attention (Luo et al., 2021) are special cases of this general mechanism. However by casting the problem as a topological (graph-based) modulation of unmasked attention, we obtain several results unknown before, including efficient d-dimensional RPE-masking and graph-kernel masking. We leverage many mathematical techniques ranging from spectral analysis through dynamic programming and random walks to new algorithms for solving Markov processes on graphs. 
We provide a corresponding empirical evaluation.}\n}", "pdf": "https://proceedings.mlr.press/v162/choromanski22a/choromanski22a.pdf", "supp": "", "pdf_size": 3401982, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1399080390715291897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Brain Robotics+Columbia University; Columbia University; Columbia University; Columbia University; Independent Researcher; University of Cambridge; University of Oxford; Google Research; University of Cambridge+The Alan Turing Institute; Google", "aff_domain": "google.com; ; ; ; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/choromanski22a.html", "aff_unique_index": "0+1;1;1;1;2;3;4;0;3+5;0", "aff_unique_norm": "Google;Columbia University;Independent Researcher;University of Cambridge;University of Oxford;Alan Turing Institute", "aff_unique_dep": "Google Brain Robotics;;;;;", "aff_unique_url": "https://ai.google;https://www.columbia.edu;;https://www.cam.ac.uk;https://www.ox.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "Google Brain Robotics;Columbia;;Cambridge;Oxford;ATI", "aff_campus_unique_index": "0;2;0;2;0", "aff_campus_unique": "Mountain View;;Cambridge", "aff_country_unique_index": "0+0;0;0;0;2;2;0;2+2;0", "aff_country_unique": "United States;;United Kingdom" }, { "title": "From data to functa: Your data point is a function and you can treat it like one", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16413", "id": "16413", "proceeding": "https://proceedings.mlr.press/v162/dupont22a.html", "poster": "/media/PosterPDFs/ICML%202022/494ba9ff03bdad881378a6fd4092a6c7_Pl4PLZ0.png?t=1657616470.3715951", "slides": "", "author_site": "Emilien Dupont, Hyunjik Kim, S. M. Ali Eslami, Danilo J. Rezende, Dan Rosenbaum", "author": "Emilien Dupont; Hyunjik Kim; S. M. Ali Eslami; Danilo Jimenez Rezende; Dan Rosenbaum", "abstract": "It is common practice in deep learning to represent a measurement of the world on a discrete grid, e.g. a 2D grid of pixels. However, the underlying signal represented by these measurements is often continuous, e.g. the scene depicted in an image. A powerful continuous alternative is then to represent these measurements using an", "bibtex": "@InProceedings{pmlr-v162-dupont22a,\n title = \t {From data to functa: Your data point is a function and you can treat it like one},\n author = {Dupont, Emilien and Kim, Hyunjik and Eslami, S. M. Ali and Rezende, Danilo Jimenez and Rosenbaum, Dan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5694--5725},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dupont22a/dupont22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dupont22a.html},\n abstract = \t {It is common practice in deep learning to represent a measurement of the world on a discrete grid, e.g. a 2D grid of pixels. However, the underlying signal represented by these measurements is often continuous, e.g. the scene depicted in an image. 
A powerful continuous alternative is then to represent these measurements using an", "pdf": "https://proceedings.mlr.press/v162/dupont22a/dupont22a.pdf", "supp": "", "pdf_size": 15138292, "gs_citation": 222, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12032092835733488210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Oxford; DeepMind; DeepMind; DeepMind; University of Haifa", "aff_domain": "stats.ox.ac.uk;google.com; ; ; ", "email": "stats.ox.ac.uk;google.com; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dupont22a.html", "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "University of Oxford;DeepMind;University of Haifa", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com;https://www.haifa.ac.il", "aff_unique_abbr": "Oxford;DeepMind;UoH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United Kingdom;Israel" }, { "title": "Frustratingly Easy Transferability Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17385", "id": "17385", "proceeding": "https://proceedings.mlr.press/v162/huang22d.html", "poster": "/media/PosterPDFs/ICML%202022/71e09b16e21f7b6919bbfc43f6a5b2f0.png?t=1657621739.3587844", "slides": "", "author_site": "Long-Kai Huang, Junzhou Huang, Yu Rong, Qiang Yang, Ying WEI", "author": "Long-Kai Huang; Junzhou Huang; Yu Rong; Qiang Yang; Ying Wei", "abstract": "Transferability estimation has been an essential tool in selecting a pre-trained model and the layers in it for transfer learning, to transfer, so as to maximize the performance on a target task and prevent negative transfer. Existing estimation algorithms either require intensive training on target tasks or have difficulties in evaluating the transferability between layers. To this end, we propose a simple, efficient, and effective transferability measure named TransRate. Through a single pass over examples of a target task, TransRate measures the transferability as the mutual information between features of target examples extracted by a pre-trained model and their labels. We overcome the challenge of efficient mutual information estimation by resorting to coding rate that serves as an effective alternative to entropy. From the perspective of feature representation, the resulting TransRate evaluates both completeness (whether features contain sufficient information of a target task) and compactness (whether features of each class are compact enough for good generalization) of pre-trained features. Theoretically, we have analyzed the close connection of TransRate to the performance after transfer learning. 
Despite its extraordinary simplicity in 10 lines of codes, TransRate performs remarkably well in extensive evaluations on 35 pre-trained models and 16 downstream tasks.", "bibtex": "@InProceedings{pmlr-v162-huang22d,\n title = \t {Frustratingly Easy Transferability Estimation},\n author = {Huang, Long-Kai and Huang, Junzhou and Rong, Yu and Yang, Qiang and Wei, Ying},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9201--9225},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22d/huang22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22d.html},\n abstract = \t {Transferability estimation has been an essential tool in selecting a pre-trained model and the layers in it for transfer learning, to transfer, so as to maximize the performance on a target task and prevent negative transfer. Existing estimation algorithms either require intensive training on target tasks or have difficulties in evaluating the transferability between layers. To this end, we propose a simple, efficient, and effective transferability measure named TransRate. Through a single pass over examples of a target task, TransRate measures the transferability as the mutual information between features of target examples extracted by a pre-trained model and their labels. We overcome the challenge of efficient mutual information estimation by resorting to coding rate that serves as an effective alternative to entropy. From the perspective of feature representation, the resulting TransRate evaluates both completeness (whether features contain sufficient information of a target task) and compactness (whether features of each class are compact enough for good generalization) of pre-trained features. Theoretically, we have analyzed the close connection of TransRate to the performance after transfer learning. 
Despite its extraordinary simplicity in 10 lines of codes, TransRate performs remarkably well in extensive evaluations on 35 pre-trained models and 16 downstream tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22d/huang22d.pdf", "supp": "", "pdf_size": 4098883, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1187909625952692597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Tencent AI Lab; Tencent AI Lab; Tencent AI Lab; Hong Kong University of Science and Technology; City University of Hong Kong", "aff_domain": "cityu.edu.hk; ; ; ;cityu.edu.hk", "email": "cityu.edu.hk; ; ; ;cityu.edu.hk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/huang22d.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Tencent;Hong Kong University of Science and Technology;City University of Hong Kong", "aff_unique_dep": "Tencent AI Lab;;", "aff_unique_url": "https://ai.tencent.com;https://www.ust.hk;https://www.cityu.edu.hk", "aff_unique_abbr": "Tencent AI Lab;HKUST;CityU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Fully-Connected Network on Noncompact Symmetric Space and Ridgelet Transform based on Helgason-Fourier Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17889", "id": "17889", "proceeding": "https://proceedings.mlr.press/v162/sonoda22a.html", "poster": "/media/PosterPDFs/ICML%202022/07c5807d0d927dcd0980f86024e5208b.png?t=1657162886.950799", "slides": "", "author_site": "Sho Sonoda, Isao Ishikawa, Masahiro Ikeda", "author": "Sho Sonoda; Isao Ishikawa; Masahiro Ikeda", "abstract": "Neural network on Riemannian symmetric space such as hyperbolic space and the manifold of symmetric positive definite (SPD) matrices is an emerging subject of research in geometric deep learning. Based on the well-established framework of the Helgason-Fourier transform on the noncompact symmetric space, we present a fully-connected network and its associated ridgelet transform on the noncompact symmetric space, covering the hyperbolic neural network (HNN) and the SPDNet as special cases. The ridgelet transform is an analysis operator of a depth-2 continuous network spanned by neurons, namely, it maps an arbitrary given function to the weights of a network. Thanks to the coordinate-free reformulation, the role of nonlinear activation functions is revealed to be a wavelet function. 
Moreover, the reconstruction formula is applied to present a constructive proof of the universality of finite networks on symmetric spaces.", "bibtex": "@InProceedings{pmlr-v162-sonoda22a,\n title = \t {Fully-Connected Network on Noncompact Symmetric Space and Ridgelet Transform based on Helgason-{F}ourier Analysis},\n author = {Sonoda, Sho and Ishikawa, Isao and Ikeda, Masahiro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20405--20422},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sonoda22a/sonoda22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sonoda22a.html},\n abstract = \t {Neural network on Riemannian symmetric space such as hyperbolic space and the manifold of symmetric positive definite (SPD) matrices is an emerging subject of research in geometric deep learning. Based on the well-established framework of the Helgason-Fourier transform on the noncompact symmetric space, we present a fully-connected network and its associated ridgelet transform on the noncompact symmetric space, covering the hyperbolic neural network (HNN) and the SPDNet as special cases. The ridgelet transform is an analysis operator of a depth-2 continuous network spanned by neurons, namely, it maps an arbitrary given function to the weights of a network. Thanks to the coordinate-free reformulation, the role of nonlinear activation functions is revealed to be a wavelet function. Moreover, the reconstruction formula is applied to present a constructive proof of the universality of finite networks on symmetric spaces.}\n}", "pdf": "https://proceedings.mlr.press/v162/sonoda22a/sonoda22a.pdf", "supp": "", "pdf_size": 416543, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5288012352356888425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "RIKEN Center for Advanced Intelligence Project (AIP), Tokyo, Japan; Ehime University, Ehime, Japan; RIKEN Center for Advanced Intelligence Project (AIP), Tokyo, Japan", "aff_domain": "riken.jp; ; ", "email": "riken.jp; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sonoda22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "RIKEN Center for Advanced Intelligence Project;Ehime University", "aff_unique_dep": "Advanced Intelligence Project;", "aff_unique_url": "https://aipcenter.riken.jp/en/;https://www.ehime-u.ac.jp", "aff_unique_abbr": "RIKEN AIP;Ehime U", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Tokyo;Ehime", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "Function-space Inference with Sparse Implicit Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16073", "id": "16073", "proceeding": "https://proceedings.mlr.press/v162/rodri-guez-santana22a.html", "poster": "/media/PosterPDFs/ICML%202022/66fae5b05c0f64c4d2bdcdf1ad85f7b2_MaBPwFY.png?t=1657218483.5723155", "slides": "/media/icml-2022/Slides/16073.pdf", "author_site": "Simon R Santana, Bryan Zaldivar, Daniel Hernandez-Lobato", "author": "Simon Rodr\u0131\u0301guez-Santana; Bryan Zaldivar; Daniel Hernandez-Lobato", "abstract": "Implicit Processes (IPs) represent a flexible framework that can be used to 
describe a wide variety of models, from Bayesian neural networks, neural samplers and data generators to many others. IPs also allow for approximate inference in function-space. This change of formulation solves intrinsic degenerate problems of parameter-space approximate inference concerning the high number of parameters and their strong dependencies in large models. For this, previous works in the literature have attempted to employ IPs both to set up the prior and to approximate the resulting posterior. However, this has proven to be a challenging task. Existing methods that can tune the prior IP result in a Gaussian predictive distribution, which fails to capture important data patterns. By contrast, methods producing flexible predictive distributions by using another IP to approximate the posterior process cannot tune the prior IP to the observed data. We propose here the first method that can accomplish both goals. For this, we rely on an inducing-point representation of the prior IP, as often done in the context of sparse Gaussian processes. The result is a scalable method for approximate inference with IPs that can tune the prior IP parameters to the data, and that provides accurate non-Gaussian predictive distributions.", "bibtex": "@InProceedings{pmlr-v162-rodri-guez-santana22a,\n title = \t {Function-space Inference with Sparse Implicit Processes},\n author = {Rodr\\'{\\i}guez-Santana, Simon and Zaldivar, Bryan and Hernandez-Lobato, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18723--18740},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rodri-guez-santana22a/rodri-guez-santana22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rodri-guez-santana22a.html},\n abstract = \t {Implicit Processes (IPs) represent a flexible framework that can be used to describe a wide variety of models, from Bayesian neural networks, neural samplers and data generators to many others. IPs also allow for approximate inference in function-space. This change of formulation solves intrinsic degenerate problems of parameter-space approximate inference concerning the high number of parameters and their strong dependencies in large models. For this, previous works in the literature have attempted to employ IPs both to set up the prior and to approximate the resulting posterior. However, this has proven to be a challenging task. Existing methods that can tune the prior IP result in a Gaussian predictive distribution, which fails to capture important data patterns. By contrast, methods producing flexible predictive distributions by using another IP to approximate the posterior process cannot tune the prior IP to the observed data. We propose here the first method that can accomplish both goals. For this, we rely on an inducing-point representation of the prior IP, as often done in the context of sparse Gaussian processes. 
The result is a scalable method for approximate inference with IPs that can tune the prior IP parameters to the data, and that provides accurate non-Gaussian predictive distributions.}\n}", "pdf": "https://proceedings.mlr.press/v162/rodri-guez-santana22a/rodri-guez-santana22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/rodriguezsantana22a-supp.zip", "pdf_size": 10456171, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1977636553306413786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Institute of Mathematical Sciences (ICMAT-CSIC), Madrid, Spain; Institute of Corpuscular Physics, University of Valencia and CSIC, Spain; Escuela Polit\u00e9cnica Superior, Universidad Aut\u00f3noma de Madrid, Spain", "aff_domain": "icmat.es; ; ", "email": "icmat.es; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/rodri-guez-santana22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Institute of Mathematical Sciences;University of Valencia;Universidad Aut\u00f3noma de Madrid", "aff_unique_dep": "Mathematical Sciences;Institute of Corpuscular Physics;Escuela Polit\u00e9cnica Superior", "aff_unique_url": "https://www.icmat.es;https://www.uv.es;https://www.uam.es", "aff_unique_abbr": "ICMAT;UV;UAM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madrid;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "title": "Functional Generalized Empirical Likelihood Estimation for Conditional Moment Restrictions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17909", "id": "17909", "proceeding": "https://proceedings.mlr.press/v162/kremer22a.html", "poster": "/media/PosterPDFs/ICML%202022/6e3b0bf8b7d5956ae572b15cd7ddb0e1.png?t=1658071090.7959306", "slides": "", "author_site": "Heiner Kremer, Jia-Jie Zhu, Krikamol Muandet, Bernhard Sch\u00f6lkopf", "author": "Heiner Kremer; Jia-Jie Zhu; Krikamol Muandet; Bernhard Sch\u00f6lkopf", "abstract": "Important problems in causal inference, economics, and, more generally, robust machine learning can be expressed as conditional moment restrictions, but estimation becomes challenging as it requires solving a continuum of unconditional moment restrictions. Previous works addressed this problem by extending the generalized method of moments (GMM) to continuum moment restrictions. In contrast, generalized empirical likelihood (GEL) provides a more general framework and has been shown to enjoy favorable small-sample properties compared to GMM-based estimators. To benefit from recent developments in machine learning, we provide a functional reformulation of GEL in which arbitrary models can be leveraged. Motivated by a dual formulation of the resulting infinite dimensional optimization problem, we devise a practical method and explore its asymptotic properties. 
Finally, we provide kernel- and neural network-based implementations of the estimator, which achieve state-of-the-art empirical performance on two conditional moment restriction problems.", "bibtex": "@InProceedings{pmlr-v162-kremer22a,\n title = \t {Functional Generalized Empirical Likelihood Estimation for Conditional Moment Restrictions},\n author = {Kremer, Heiner and Zhu, Jia-Jie and Muandet, Krikamol and Sch{\\\"o}lkopf, Bernhard},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11665--11682},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kremer22a/kremer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kremer22a.html},\n abstract = \t {Important problems in causal inference, economics, and, more generally, robust machine learning can be expressed as conditional moment restrictions, but estimation becomes challenging as it requires solving a continuum of unconditional moment restrictions. Previous works addressed this problem by extending the generalized method of moments (GMM) to continuum moment restrictions. In contrast, generalized empirical likelihood (GEL) provides a more general framework and has been shown to enjoy favorable small-sample properties compared to GMM-based estimators. To benefit from recent developments in machine learning, we provide a functional reformulation of GEL in which arbitrary models can be leveraged. Motivated by a dual formulation of the resulting infinite dimensional optimization problem, we devise a practical method and explore its asymptotic properties. 
Finally, we provide kernel- and neural network-based implementations of the estimator, which achieve state-of-the-art empirical performance on two conditional moment restriction problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/kremer22a/kremer22a.pdf", "supp": "", "pdf_size": 614503, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4926746325545340155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Weierstrass Institute for Applied Analysis and Stochastics, Berlin, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "tuebingen.mpg.de; ; ; ", "email": "tuebingen.mpg.de; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kremer22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Weierstrass Institute for Applied Analysis and Stochastics", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.wias-berlin.de/", "aff_unique_abbr": "MPI-IS;WIAS", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "T\u00fcbingen;Berlin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Functional Output Regression with Infimal Convolution: Exploring the Huber and $\u03b5$-insensitive Losses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17427", "id": "17427", "proceeding": "https://proceedings.mlr.press/v162/lambert22a.html", "poster": "/media/PosterPDFs/ICML%202022/7f141cf8e7136ce8701dc6636c2a6fe4.png?t=1658414173.7576902", "slides": "/media/icml-2022/Slides/17427.pdf", "author_site": "Alex Lambert, Dimitri Bouche, Zoltan Szabo, Florence d'Alch\u00e9-Buc", "author": "Alex Lambert; Dimitri Bouche; Zoltan Szabo; Florence D\u2019Alch\u00e9-Buc", "abstract": "The focus of the paper is functional output regression (FOR) with convoluted losses. While most existing work consider the square loss setting, we leverage extensions of the Huber and the $\\epsilon$-insensitive loss (induced by infimal convolution) and propose a flexible framework capable of handling various forms of outliers and sparsity in the FOR family. We derive computationally tractable algorithms relying on duality to tackle the resulting tasks in the context of vector-valued reproducing kernel Hilbert spaces. 
The efficiency of the approach is demonstrated and contrasted with the classical squared loss setting on both synthetic and real-world benchmarks.", "bibtex": "@InProceedings{pmlr-v162-lambert22a,\n title = \t {Functional Output Regression with Infimal Convolution: Exploring the Huber and $\u03b5$-insensitive Losses},\n author = {Lambert, Alex and Bouche, Dimitri and Szabo, Zoltan and D'Alch{\\'e}-Buc, Florence},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11844--11867},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lambert22a/lambert22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lambert22a.html},\n abstract = \t {The focus of the paper is functional output regression (FOR) with convoluted losses. While most existing work consider the square loss setting, we leverage extensions of the Huber and the $\\epsilon$-insensitive loss (induced by infimal convolution) and propose a flexible framework capable of handling various forms of outliers and sparsity in the FOR family. We derive computationally tractable algorithms relying on duality to tackle the resulting tasks in the context of vector-valued reproducing kernel Hilbert spaces. The efficiency of the approach is demonstrated and contrasted with the classical squared loss setting on both synthetic and real-world benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/lambert22a/lambert22a.pdf", "supp": "", "pdf_size": 2336476, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13118582575057878063&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "LTCI, Telecom Paris, IP Paris, France+ESAT, KU Leuven, Belgium; LTCI, Telecom Paris, IP Paris, France; Department of Statistics, London School of Economics, United Kingdom; LTCI, Telecom Paris, IP Paris, France", "aff_domain": "kuleuven.be; ; ; ", "email": "kuleuven.be; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lambert22a.html", "aff_unique_index": "0+1;0;2;0", "aff_unique_norm": "Telecom Paris;KU Leuven;London School of Economics", "aff_unique_dep": "LTCI;ESAT;Department of Statistics", "aff_unique_url": "https://www.telecom-paris.fr;https://www.kuleuven.be;https://www.lse.ac.uk", "aff_unique_abbr": "Telecom Paris;;LSE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;2;0", "aff_country_unique": "France;Belgium;United Kingdom" }, { "title": "G$^2$CN: Graph Gaussian Convolution Networks with Concentrated Graph Filters", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17629", "id": "17629", "proceeding": "https://proceedings.mlr.press/v162/li22h.html", "poster": "/media/PosterPDFs/ICML%202022/88ef51f0bf911e452e8dbb1d807a81ab.png?t=1658057526.3954146", "slides": "", "author_site": "Mingjie Li, Xiaojun Guo, Yifei Wang, Yisen Wang, Zhouchen Lin", "author": "Mingjie Li; Xiaojun Guo; Yifei Wang; Yisen Wang; Zhouchen Lin", "abstract": "Recently, linear GCNs have shown competitive performance against non-linear ones with less computation cost, and the key lies in their propagation layers. Spectral analysis has been widely adopted in designing and analyzing existing graph propagations. 
Nevertheless, we notice that existing spectral analysis fails to explain why existing graph propagations with the same global tendency, such as low-pass or high-pass, still yield very different results. Motivated by this situation, we develop a new framework for spectral analysis in this paper called concentration analysis. In particular, we propose three attributes: concentration centre, maximum response, and bandwidth for our analysis. Through a dissection of the limitations of existing graph propagations via the above analysis, we propose a new kind of propagation layer, Graph Gaussian Convolution Networks (G^2CN), in which the three properties are decoupled and the whole structure becomes more flexible and applicable to different kinds of graphs. Extensive experiments show that we can obtain state-of-the-art performance on heterophily and homophily datasets with our proposed G^2CN.", "bibtex": "@InProceedings{pmlr-v162-li22h,\n title = \t {{G}$^2${CN}: Graph {G}aussian Convolution Networks with Concentrated Graph Filters},\n author = {Li, Mingjie and Guo, Xiaojun and Wang, Yifei and Wang, Yisen and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12782--12796},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22h/li22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22h.html},\n abstract = \t {Recently, linear GCNs have shown competitive performance against non-linear ones with less computation cost, and the key lies in their propagation layers. Spectral analysis has been widely adopted in designing and analyzing existing graph propagations. Nevertheless, we notice that existing spectral analysis fails to explain why existing graph propagations with the same global tendency, such as low-pass or high-pass, still yield very different results. Motivated by this situation, we develop a new framework for spectral analysis in this paper called concentration analysis. In particular, we propose three attributes: concentration centre, maximum response, and bandwidth for our analysis. Through a dissection of the limitations of existing graph propagations via the above analysis, we propose a new kind of propagation layer, Graph Gaussian Convolution Networks (G^2CN), in which the three properties are decoupled and the whole structure becomes more flexible and applicable to different kinds of graphs. Extensive experiments show that we can obtain state-of-the-art performance on heterophily and homophily datasets with our proposed G^2CN.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22h/li22h.pdf", "supp": "", "pdf_size": 466541, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9435672911110832415&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Key Lab. of Machine Perception (MoE), School of Artificial Intelligence, Peking University; Key Lab. 
of Machine Perception (MoE), School of Artificial Intelligence, Peking University; School of Mathematical Sciences, Peking University; Institute for Artificial Intelligence, Peking University; Institute for Artificial Intelligence, Peking University + Pazhou Lab, Guangzhou, China", "aff_domain": "pku.edu.cn; ; ; ;pku.edu.cn", "email": "pku.edu.cn; ; ; ;pku.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/li22h.html", "aff_unique_index": "0;0;0;0;0+1", "aff_unique_norm": "Peking University;Pazhou Lab", "aff_unique_dep": "School of Artificial Intelligence;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Beijing;Guangzhou", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "G-Mixup: Graph Data Augmentation for Graph Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16665", "id": "16665", "proceeding": "https://proceedings.mlr.press/v162/han22c.html", "poster": "", "slides": "", "author_site": "Xiaotian Han, Zhimeng Jiang, Ninghao Liu, Xia Hu", "author": "Xiaotian Han; Zhimeng Jiang; Ninghao Liu; Xia Hu", "abstract": "This work develops mixup for graph data. Mixup has shown superiority in improving the generalization and robustness of neural networks by interpolating features and labels between two random samples. Traditionally, Mixup can work on regular, grid-like, and Euclidean data such as image or tabular data. However, it is challenging to directly adopt Mixup to augment graph data because different graphs typically: 1) have different numbers of nodes; 2) are not readily aligned; and 3) have unique typologies in non-Euclidean space. To this end, we propose G-Mixup to augment graphs for graph classification by interpolating the generator (i.e., graphon) of different classes of graphs. Specifically, we first use graphs within the same class to estimate a graphon. Then, instead of directly manipulating graphs, we interpolate graphons of different classes in the Euclidean space to get mixed graphons, where the synthetic graphs are generated through sampling based on the mixed graphons. Extensive experiments show that G-Mixup substantially improves the generalization and robustness of GNNs.", "bibtex": "@InProceedings{pmlr-v162-han22c,\n title = \t {G-Mixup: Graph Data Augmentation for Graph Classification},\n author = {Han, Xiaotian and Jiang, Zhimeng and Liu, Ninghao and Hu, Xia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8230--8248},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22c/han22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22c.html},\n abstract = \t {This work develops mixup for graph data. Mixup has shown superiority in improving the generalization and robustness of neural networks by interpolating features and labels between two random samples. Traditionally, Mixup can work on regular, grid-like, and Euclidean data such as image or tabular data. 
However, it is challenging to directly adopt Mixup to augment graph data because different graphs typically: 1) have different numbers of nodes; 2) are not readily aligned; and 3) have unique typologies in non-Euclidean space. To this end, we propose G-Mixup to augment graphs for graph classification by interpolating the generator (i.e., graphon) of different classes of graphs. Specifically, we first use graphs within the same class to estimate a graphon. Then, instead of directly manipulating graphs, we interpolate graphons of different classes in the Euclidean space to get mixed graphons, where the synthetic graphs are generated through sampling based on the mixed graphons. Extensive experiments show that G-Mixup substantially improves the generalization and robustness of GNNs.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22c/han22c.pdf", "supp": "", "pdf_size": 2893309, "gs_citation": 255, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16313647976714156830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science&Engineering, Texas A&M University; Department of Computer Science&Engineering, Texas A&M University; Department of Computer Science, University of Georgia; Department of Computer Science, Rice University", "aff_domain": "tamu.edu; ; ; ", "email": "tamu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/han22c.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Texas A&M University;University of Georgia;Rice University", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.tamu.edu;https://www.uga.edu;https://www.rice.edu", "aff_unique_abbr": "TAMU;UGA;Rice", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "GACT: Activation Compressed Training for Generic Network Architectures", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17189", "id": "17189", "proceeding": "https://proceedings.mlr.press/v162/liu22v.html", "poster": "/media/PosterPDFs/ICML%202022/f8da71e562ff44a2bc7edf3578c593da.png?t=1657867417.2075663", "slides": "/media/icml-2022/Slides/17189.pdf", "author_site": "Xiaoxuan Liu, Lianmin Zheng, Dequan Wang, Yukuo Cen, Weize Chen, Xu Han, Jianfei Chen, Zhiyuan Liu, Jie Tang, Joseph Gonzalez, Michael Mahoney, Alvin Cheung", "author": "Xiaoxuan Liu; Lianmin Zheng; Dequan Wang; Yukuo Cen; Weize Chen; Xu Han; Jianfei Chen; Zhiyuan Liu; Jie Tang; Joey Gonzalez; Michael Mahoney; Alvin Cheung", "abstract": "Training large neural network (NN) models requires extensive memory resources, and Activation Compression Training (ACT) is a promising approach to reduce training memory footprint. This paper presents GACT, an ACT framework to support a broad range of machine learning tasks for generic NN architectures with limited domain knowledge. By analyzing a linearized version of ACT\u2019s approximate gradient, we prove the convergence of GACT without prior knowledge on operator type or model architecture. To make training stable, we propose an algorithm that decides the compression ratio for each tensor by estimating its impact on the gradient at run time. We implement GACT as a PyTorch library that readily applies to any NN architecture. 
GACT reduces the activation memory for convolutional NNs, transformers, and graph NNs by up to 8.1x, enabling training with a 4.2x to 24.7x larger batch size, with negligible accuracy loss.", "bibtex": "@InProceedings{pmlr-v162-liu22v,\n title = \t {{GACT}: Activation Compressed Training for Generic Network Architectures},\n author = {Liu, Xiaoxuan and Zheng, Lianmin and Wang, Dequan and Cen, Yukuo and Chen, Weize and Han, Xu and Chen, Jianfei and Liu, Zhiyuan and Tang, Jie and Gonzalez, Joey and Mahoney, Michael and Cheung, Alvin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14139--14152},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22v/liu22v.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22v.html},\n abstract = \t {Training large neural network (NN) models requires extensive memory resources, and Activation Compression Training (ACT) is a promising approach to reduce training memory footprint. This paper presents GACT, an ACT framework to support a broad range of machine learning tasks for generic NN architectures with limited domain knowledge. By analyzing a linearized version of ACT\u2019s approximate gradient, we prove the convergence of GACT without prior knowledge on operator type or model architecture. To make training stable, we propose an algorithm that decides the compression ratio for each tensor by estimating its impact on the gradient at run time. We implement GACT as a PyTorch library that readily applies to any NN architecture. GACT reduces the activation memory for convolutional NNs, transformers, and graph NNs by up to 8.1x, enabling training with a 4.2x to 24.7x larger batch size, with negligible accuracy loss.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22v/liu22v.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liu22v-supp.zip", "pdf_size": 926298, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12961558979640169971&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": ";;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;", "email": ";;;;;;;;;;;", "github": "", "project": "", "author_num": 12, "oa": "https://proceedings.mlr.press/v162/liu22v.html" }, { "title": "GALAXY: Graph-based Active Learning at the Extreme", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16535", "id": "16535", "proceeding": "https://proceedings.mlr.press/v162/zhang22k.html", "poster": "/media/PosterPDFs/ICML%202022/cc0991344c3d760ae42259064406bae1.png?t=1658014077.3687956", "slides": "", "author_site": "Jifan Zhang, Julian Katz-Samuels, Robert Nowak", "author": "Jifan Zhang; Julian Katz-Samuels; Robert Nowak", "abstract": "Active learning is a label-efficient approach to train highly effective models while interactively selecting only small subsets of unlabelled data for labelling and training. In \u201copen world\" settings, the classes of interest can make up a small fraction of the overall dataset \u2013 most of the data may be viewed as an out-of-distribution or irrelevant class. This leads to extreme class-imbalance, and our theory and methods focus on this core issue. 
We propose a new strategy for active learning called GALAXY (Graph-based Active Learning At the eXtrEme), which blends ideas from graph-based active learning and deep learning. GALAXY automatically and adaptively selects more class-balanced examples for labeling than most other methods for active learning. Our theory shows that GALAXY performs a refined form of uncertainty sampling that gathers a much more class-balanced dataset than vanilla uncertainty sampling. Experimentally, we demonstrate GALAXY\u2019s superiority over existing state-of-art deep active learning algorithms in unbalanced vision classification settings generated from popular datasets.", "bibtex": "@InProceedings{pmlr-v162-zhang22k,\n title = \t {{GALAXY}: Graph-based Active Learning at the Extreme},\n author = {Zhang, Jifan and Katz-Samuels, Julian and Nowak, Robert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26223--26238},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22k/zhang22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22k.html},\n abstract = \t {Active learning is a label-efficient approach to train highly effective models while interactively selecting only small subsets of unlabelled data for labelling and training. In \u201copen world\" settings, the classes of interest can make up a small fraction of the overall dataset \u2013 most of the data may be viewed as an out-of-distribution or irrelevant class. This leads to extreme class-imbalance, and our theory and methods focus on this core issue. We propose a new strategy for active learning called GALAXY (Graph-based Active Learning At the eXtrEme), which blends ideas from graph-based active learning and deep learning. GALAXY automatically and adaptively selects more class-balanced examples for labeling than most other methods for active learning. Our theory shows that GALAXY performs a refined form of uncertainty sampling that gathers a much more class-balanced dataset than vanilla uncertainty sampling. 
Experimentally, we demonstrate GALAXY\u2019s superiority over existing state-of-art deep active learning algorithms in unbalanced vision classification settings generated from popular datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22k/zhang22k.pdf", "supp": "", "pdf_size": 1165294, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10022632741658948627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Wisconsin, Madison, USA; University of Wisconsin, Madison, USA; University of Wisconsin, Madison, USA", "aff_domain": "cs.wisc.edu; ; ", "email": "cs.wisc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22k.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16339", "id": "16339", "proceeding": "https://proceedings.mlr.press/v162/nichol22a.html", "poster": "/media/PosterPDFs/ICML%202022/12ced2db6f0193dda91ba86224ea1cd8_mKpNLkb.png?t=1657759196.2550344", "slides": "", "author_site": "Alexander Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob McGrew, Ilya Sutskever, Mark Chen", "author": "Alexander Quinn Nichol; Prafulla Dhariwal; Aditya Ramesh; Pranav Shyam; Pamela Mishkin; Bob Mcgrew; Ilya Sutskever; Mark Chen", "abstract": "Diffusion models have recently been shown to generate high-quality synthetic images, especially when paired with a guidance technique to trade off diversity for fidelity. We explore diffusion models for the problem of text-conditional image synthesis and compare two different guidance strategies: CLIP guidance and classifier-free guidance. We find that the latter is preferred by human evaluators for both photorealism and caption similarity, and often produces photorealistic samples. Samples from a 3.5\u00a0billion parameter text-conditional diffusion model using classifier-free guidance are favored by human evaluators to those from DALL-E, even when the latter uses expensive CLIP reranking. Additionally, we find that our models can be fine-tuned to perform image inpainting, enabling powerful text-driven image editing. 
We train a smaller model on a filtered dataset and release the code and weights at https://github.com/openai/glide-text2im.", "bibtex": "@InProceedings{pmlr-v162-nichol22a,\n title = \t {{GLIDE}: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models},\n author = {Nichol, Alexander Quinn and Dhariwal, Prafulla and Ramesh, Aditya and Shyam, Pranav and Mishkin, Pamela and Mcgrew, Bob and Sutskever, Ilya and Chen, Mark},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16784--16804},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nichol22a/nichol22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nichol22a.html},\n abstract = \t {Diffusion models have recently been shown to generate high-quality synthetic images, especially when paired with a guidance technique to trade off diversity for fidelity. We explore diffusion models for the problem of text-conditional image synthesis and compare two different guidance strategies: CLIP guidance and classifier-free guidance. We find that the latter is preferred by human evaluators for both photorealism and caption similarity, and often produces photorealistic samples. Samples from a 3.5\u00a0billion parameter text-conditional diffusion model using classifier-free guidance are favored by human evaluators to those from DALL-E, even when the latter uses expensive CLIP reranking. Additionally, we find that our models can be fine-tuned to perform image inpainting, enabling powerful text-driven image editing. 
We train a smaller model on a filtered dataset and release the code and weights at https://github.com/openai/glide-text2im.}\n}", "pdf": "https://proceedings.mlr.press/v162/nichol22a/nichol22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/nichol22a-supp.zip", "pdf_size": 20974692, "gs_citation": 3944, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15472303808406531445&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "OpenAI; OpenAI; OpenAI; OpenAI; OpenAI; OpenAI; OpenAI; OpenAI", "aff_domain": "openai.com;openai.com;openai.com; ; ; ; ; ", "email": "openai.com;openai.com;openai.com; ; ; ; ; ", "github": "https://github.com/openai/glide-text2im", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/nichol22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "OpenAI", "aff_unique_dep": "", "aff_unique_url": "https://openai.com", "aff_unique_abbr": "OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "GLaM: Efficient Scaling of Language Models with Mixture-of-Experts", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17377", "id": "17377", "proceeding": "https://proceedings.mlr.press/v162/du22c.html", "poster": "/media/PosterPDFs/ICML%202022/1e5186bca8f75fca53960e8cb4a3b973.png?t=1657518545.8019478", "slides": "", "author_site": "Nan Du, Yanping Huang, Andrew Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, Barret Zoph, William Fedus, Maarten Bosma, Zongwei Zhou, Tao Wang, Emma Wang, Kellie Webster, Marie Pellat, Kevin Robinson, Kathleen Meier-Hellstern, Toju Duke, Lucas Dixon, Kun Zhang, Quoc Le, Yonghui Wu, Zhifeng Chen, Claire Cui", "author": "Nan Du; Yanping Huang; Andrew M Dai; Simon Tong; Dmitry Lepikhin; Yuanzhong Xu; Maxim Krikun; Yanqi Zhou; Adams Wei Yu; Orhan Firat; Barret Zoph; Liam Fedus; Maarten P Bosma; Zongwei Zhou; Tao Wang; Emma Wang; Kellie Webster; Marie Pellat; Kevin Robinson; Kathleen Meier-Hellstern; Toju Duke; Lucas Dixon; Kun Zhang; Quoc Le; Yonghui Wu; Zhifeng Chen; Claire Cui", "abstract": "Scaling language models with more data, compute and parameters has driven significant progress in natural language processing. For example, thanks to scaling, GPT-3 was able to achieve strong results on in-context learning tasks. However, training these large dense models requires significant amounts of computing resources. In this paper, we propose and develop a family of language models named \\glam (\\textbf{G}eneralist \\textbf{La}nguage \\textbf{M}odel), which uses a sparsely activated mixture-of-experts architecture to scale the model capacity while also incurring substantially less training cost compared to dense variants. The largest \\glam has 1.2 trillion parameters, which is approximately 7x larger than GPT-3. 
It consumes only 1/3 of the energy used to train GPT-3 and requires half of the computation flops for inference, while still achieving better overall fewshot performance across 29 NLP tasks.", "bibtex": "@InProceedings{pmlr-v162-du22c,\n title = \t {{GL}a{M}: Efficient Scaling of Language Models with Mixture-of-Experts},\n author = {Du, Nan and Huang, Yanping and Dai, Andrew M and Tong, Simon and Lepikhin, Dmitry and Xu, Yuanzhong and Krikun, Maxim and Zhou, Yanqi and Yu, Adams Wei and Firat, Orhan and Zoph, Barret and Fedus, Liam and Bosma, Maarten P and Zhou, Zongwei and Wang, Tao and Wang, Emma and Webster, Kellie and Pellat, Marie and Robinson, Kevin and Meier-Hellstern, Kathleen and Duke, Toju and Dixon, Lucas and Zhang, Kun and Le, Quoc and Wu, Yonghui and Chen, Zhifeng and Cui, Claire},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5547--5569},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/du22c/du22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/du22c.html},\n abstract = \t {Scaling language models with more data, compute and parameters has driven significant progress in natural language processing. For example, thanks to scaling, GPT-3 was able to achieve strong results on in-context learning tasks. However, training these large dense models requires significant amounts of computing resources. In this paper, we propose and develop a family of language models named \\glam (\\textbf{G}eneralist \\textbf{La}nguage \\textbf{M}odel), which uses a sparsely activated mixture-of-experts architecture to scale the model capacity while also incurring substantially less training cost compared to dense variants. The largest \\glam has 1.2 trillion parameters, which is approximately 7x larger than GPT-3. It consumes only 1/3 of the energy used to train GPT-3 and requires half of the computation flops for inference, while still achieving better overall fewshot performance across 29 NLP tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/du22c/du22c.pdf", "supp": "", "pdf_size": 786104, "gs_citation": 765, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2314975218365145566&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;;;;;;;;;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;;;;;;;;", "github": "", "project": "", "author_num": 27, "oa": "https://proceedings.mlr.press/v162/du22c.html" }, { "title": "GNNRank: Learning Global Rankings from Pairwise Comparisons via Directed Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18121", "id": "18121", "proceeding": "https://proceedings.mlr.press/v162/he22b.html", "poster": "/media/PosterPDFs/ICML%202022/160c88652d47d0be60bfbfed25111412_Kmcg75K.png?t=1655329553.1916773", "slides": "/media/icml-2022/Slides/18121_2ZdoEBe.pdf", "author_site": "Yixuan He, Quan Gan, David Wipf, Gesine Reinert, Junchi Yan, Mihai Cucuringu", "author": "Yixuan He; Quan Gan; David Wipf; Gesine D Reinert; Junchi Yan; Mihai Cucuringu", "abstract": "Recovering global rankings from pairwise comparisons has wide applications from time synchronization to sports team ranking. 
Pairwise comparisons corresponding to matches in a competition can be construed as edges in a directed graph (digraph), whose nodes represent e.g. competitors with an unknown rank. In this paper, we introduce neural networks into the ranking recovery problem by proposing the so-called GNNRank, a trainable GNN-based framework with digraph embedding. Moreover, new objectives are devised to encode ranking upsets/violations. The framework involves a ranking score estimation approach, and adds an inductive bias by unfolding the Fiedler vector computation of the graph constructed from a learnable similarity matrix. Experimental results on extensive data sets show that our methods attain competitive and often superior performance against baselines, as well as showing promising transfer ability. Codes and preprocessed data are at: \\url{https://github.com/SherylHYX/GNNRank}.", "bibtex": "@InProceedings{pmlr-v162-he22b,\n title = \t {{GNNR}ank: Learning Global Rankings from Pairwise Comparisons via Directed Graph Neural Networks},\n author = {He, Yixuan and Gan, Quan and Wipf, David and Reinert, Gesine D and Yan, Junchi and Cucuringu, Mihai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8581--8612},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22b/he22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22b.html},\n abstract = \t {Recovering global rankings from pairwise comparisons has wide applications from time synchronization to sports team ranking. Pairwise comparisons corresponding to matches in a competition can be construed as edges in a directed graph (digraph), whose nodes represent e.g. competitors with an unknown rank. In this paper, we introduce neural networks into the ranking recovery problem by proposing the so-called GNNRank, a trainable GNN-based framework with digraph embedding. Moreover, new objectives are devised to encode ranking upsets/violations. The framework involves a ranking score estimation approach, and adds an inductive bias by unfolding the Fiedler vector computation of the graph constructed from a learnable similarity matrix. Experimental results on extensive data sets show that our methods attain competitive and often superior performance against baselines, as well as showing promising transfer ability. 
Codes and preprocessed data are at: \\url{https://github.com/SherylHYX/GNNRank}.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22b/he22b.pdf", "supp": "", "pdf_size": 670598, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4446473441491315248&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Statistics, University of Oxford, Oxford, United Kingdom+The Alan Turing Institute, London, United Kingdom; Amazon Web Services AI Shanghai Lablet, Shanghai, China; Amazon Web Services AI Shanghai Lablet, Shanghai, China; Department of Statistics, University of Oxford, Oxford, United Kingdom+The Alan Turing Institute, London, United Kingdom; Department of Computer Science and Engineering and MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China+Shanghai AI Laboratory, Shanghai, China; Department of Statistics, University of Oxford, Oxford, United Kingdom+The Alan Turing Institute, London, United Kingdom", "aff_domain": "stats.ox.ac.uk;amazon.com;amazon.com;stats.ox.ac.uk;sjtu.edu.cn;stats.ox.ac.uk", "email": "stats.ox.ac.uk;amazon.com;amazon.com;stats.ox.ac.uk;sjtu.edu.cn;stats.ox.ac.uk", "github": "https://github.com/SherylHYX/GNNRank", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/he22b.html", "aff_unique_index": "0+1;2;2;0+1;3+4;0+1", "aff_unique_norm": "University of Oxford;Alan Turing Institute;Amazon;Shanghai Jiao Tong University;Shanghai AI Laboratory", "aff_unique_dep": "Department of Statistics;;AI Shanghai Lablet;Department of Computer Science and Engineering;", "aff_unique_url": "https://www.ox.ac.uk;https://www.turing.ac.uk;https://aws.amazon.com;https://www.sjtu.edu.cn;https://www.shanghaiailab.com", "aff_unique_abbr": "Oxford;ATI;AWS;SJTU;SAIL", "aff_campus_unique_index": "0+1;2;2;0+1;2+2;0+1", "aff_campus_unique": "Oxford;London;Shanghai", "aff_country_unique_index": "0+0;1;1;0+0;1+1;0+0", "aff_country_unique": "United Kingdom;China" }, { "title": "GSmooth: Certified Robustness against Semantic Transformations via Generalized Randomized Smoothing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17371", "id": "17371", "proceeding": "https://proceedings.mlr.press/v162/hao22c.html", "poster": "/media/PosterPDFs/ICML%202022/142536b9b535b78e681c11b0195d962f.png?t=1657519540.2210944", "slides": "/media/icml-2022/Slides/17371.pdf", "author_site": "Zhongkai Hao, Chengyang Ying, Yinpeng Dong, Hang Su, Jian Song, Jun Zhu", "author": "Zhongkai Hao; Chengyang Ying; Yinpeng Dong; Hang Su; Jian Song; Jun Zhu", "abstract": "Certified defenses such as randomized smoothing have shown promise towards building reliable machine learning systems against $\\ell_p$ norm bounded attacks. However, existing methods are insufficient or unable to provably defend against semantic transformations, especially those without closed-form expressions (such as defocus blur and pixelate), which are more common in practice and often unrestricted. To fill up this gap, we propose generalized randomized smoothing (GSmooth), a unified theoretical framework for certifying robustness against general semantic transformations via a novel dimension augmentation strategy. Under the GSmooth framework, we present a scalable algorithm that uses a surrogate image-to-image network to approximate the complex transformation. The surrogate model provides a powerful tool for studying the properties of semantic transformations and certifying robustness. 
Experimental results on several datasets demonstrate the effectiveness of our approach for robustness certification against multiple kinds of semantic transformations and corruptions, which is not achievable by the alternative baselines.", "bibtex": "@InProceedings{pmlr-v162-hao22c,\n title = \t {{GS}mooth: Certified Robustness against Semantic Transformations via Generalized Randomized Smoothing},\n author = {Hao, Zhongkai and Ying, Chengyang and Dong, Yinpeng and Su, Hang and Song, Jian and Zhu, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8465--8483},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hao22c/hao22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/hao22c.html},\n abstract = \t {Certified defenses such as randomized smoothing have shown promise towards building reliable machine learning systems against $\\ell_p$ norm bounded attacks. However, existing methods are insufficient or unable to provably defend against semantic transformations, especially those without closed-form expressions (such as defocus blur and pixelate), which are more common in practice and often unrestricted. To fill up this gap, we propose generalized randomized smoothing (GSmooth), a unified theoretical framework for certifying robustness against general semantic transformations via a novel dimension augmentation strategy. Under the GSmooth framework, we present a scalable algorithm that uses a surrogate image-to-image network to approximate the complex transformation. The surrogate model provides a powerful tool for studying the properties of semantic transformations and certifying robustness. Experimental results on several datasets demonstrate the effectiveness of our approach for robustness certification against multiple kinds of semantic transformations and corruptions, which is not achievable by the alternative baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/hao22c/hao22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/hao22c-supp.zip", "pdf_size": 1152233, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1944882134693943289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/hao22c.html" }, { "title": "Gating Dropout: Communication-efficient Regularization for Sparsely Activated Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17415", "id": "17415", "proceeding": "https://proceedings.mlr.press/v162/liu22g.html", "poster": "", "slides": "", "author_site": "Rui Liu, Young Jin Kim, Alexandre Muzio, Hany Hassan", "author": "Rui Liu; Young Jin Kim; Alexandre Muzio; Hany Hassan", "abstract": "Sparsely activated transformers, such as Mixture of Experts (MoE), have received great interest due to their outrageous scaling capability which enables dramatical increases in model size without significant increases in computational cost. 
To achieve this, MoE models replace the feedforward sub-layer with Mixture-of-Experts sub-layer in transformers and use a gating network to route each token to its assigned experts. Since the common practice for efficient training of such models requires distributing experts and tokens across different machines, this routing strategy often incurs huge cross-machine communication cost because tokens and their assigned experts likely reside in different machines. In this paper, we propose", "bibtex": "@InProceedings{pmlr-v162-liu22g,\n title = \t {Gating Dropout: Communication-efficient Regularization for Sparsely Activated Transformers},\n author = {Liu, Rui and Kim, Young Jin and Muzio, Alexandre and Hassan, Hany},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13782--13792},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22g/liu22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22g.html},\n abstract = \t {Sparsely activated transformers, such as Mixture of Experts (MoE), have received great interest due to their outrageous scaling capability which enables dramatical increases in model size without significant increases in computational cost. To achieve this, MoE models replace the feedforward sub-layer with Mixture-of-Experts sub-layer in transformers and use a gating network to route each token to its assigned experts. Since the common practice for efficient training of such models requires distributing experts and tokens across different machines, this routing strategy often incurs huge cross-machine communication cost because tokens and their assigned experts likely reside in different machines. In this paper, we propose", "pdf": "https://proceedings.mlr.press/v162/liu22g/liu22g.pdf", "supp": "", "pdf_size": 527715, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14335741941990626986&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Michigan, Ann Arbor; Microsoft; Microsoft; Microsoft", "aff_domain": "umich.edu;microsoft.com; ; ", "email": "umich.edu;microsoft.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22g.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Michigan;Microsoft", "aff_unique_dep": ";Microsoft Corporation", "aff_unique_url": "https://www.umich.edu;https://www.microsoft.com", "aff_unique_abbr": "UM;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Gaussian Mixture Variational Autoencoder with Contrastive Learning for Multi-Label Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16865", "id": "16865", "proceeding": "https://proceedings.mlr.press/v162/bai22c.html", "poster": "/media/PosterPDFs/ICML%202022/dfd786998e082758be12670d856df755_2t1Rf8U.png?t=1657206277.672126", "slides": "", "author_site": "Junwen Bai, Shufeng Kong, Carla Gomes", "author": "Junwen Bai; Shufeng Kong; Carla P Gomes", "abstract": "Multi-label classification (MLC) is a prediction task where each sample can have more than one label. 
We propose a novel contrastive learning boosted multi-label prediction model based on a Gaussian mixture variational autoencoder (C-GMVAE), which learns a multimodal prior space and employs a contrastive loss. Many existing methods introduce extra complex neural modules like graph neural networks to capture the label correlations, in addition to the prediction modules. We find that by using contrastive learning in the supervised setting, we can exploit label information effectively in a data-driven manner, and learn meaningful feature and label embeddings which capture the label correlations and enhance the predictive power. Our method also adopts the idea of learning and aligning latent spaces for both features and labels. In contrast to previous works based on a unimodal prior, C-GMVAE imposes a Gaussian mixture structure on the latent space, to alleviate the posterior collapse and over-regularization issues. C-GMVAE outperforms existing methods on multiple public datasets and can often match other models\u2019 full performance with only 50% of the training data. Furthermore, we show that the learnt embeddings provide insights into the interpretation of label-label interactions.", "bibtex": "@InProceedings{pmlr-v162-bai22c,\n title = \t {{G}aussian Mixture Variational Autoencoder with Contrastive Learning for Multi-Label Classification},\n author = {Bai, Junwen and Kong, Shufeng and Gomes, Carla P},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1383--1398},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bai22c/bai22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/bai22c.html},\n abstract = \t {Multi-label classification (MLC) is a prediction task where each sample can have more than one label. We propose a novel contrastive learning boosted multi-label prediction model based on a Gaussian mixture variational autoencoder (C-GMVAE), which learns a multimodal prior space and employs a contrastive loss. Many existing methods introduce extra complex neural modules like graph neural networks to capture the label correlations, in addition to the prediction modules. We find that by using contrastive learning in the supervised setting, we can exploit label information effectively in a data-driven manner, and learn meaningful feature and label embeddings which capture the label correlations and enhance the predictive power. Our method also adopts the idea of learning and aligning latent spaces for both features and labels. In contrast to previous works based on a unimodal prior, C-GMVAE imposes a Gaussian mixture structure on the latent space, to alleviate the posterior collapse and over-regularization issues. C-GMVAE outperforms existing methods on multiple public datasets and can often match other models\u2019 full performance with only 50% of the training data. 
Furthermore, we show that the learnt embeddings provide insights into the interpretation of label-label interactions.}\n}", "pdf": "https://proceedings.mlr.press/v162/bai22c/bai22c.pdf", "supp": "", "pdf_size": 596298, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9275720515589327599&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Cornell University; Department of Computer Science, Cornell University; Department of Computer Science, Cornell University", "aff_domain": "cornell.edu;cornell.edu;cornell.edu", "email": "cornell.edu;cornell.edu;cornell.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bai22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Gaussian Process Uniform Error Bounds with Unknown Hyperparameters for Safety-Critical Applications", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16023", "id": "16023", "proceeding": "https://proceedings.mlr.press/v162/capone22a.html", "poster": "/media/PosterPDFs/ICML%202022/6c468ec5a41d65815de23ec1d08d7951.png?t=1658099229.4874067", "slides": "", "author_site": "Alexandre Capone, Armin Lederer, Sandra Hirche", "author": "Alexandre Capone; Armin Lederer; Sandra Hirche", "abstract": "Gaussian processes have become a promising tool for various safety-critical settings, since the posterior variance can be used to directly estimate the model error and quantify risk. However, state-of-the-art techniques for safety-critical settings hinge on the assumption that the kernel hyperparameters are known, which does not apply in general. To mitigate this, we introduce robust Gaussian process uniform error bounds in settings with unknown hyperparameters. Our approach computes a confidence region in the space of hyperparameters, which enables us to obtain a probabilistic upper bound for the model error of a Gaussian process with arbitrary hyperparameters. We do not require to know any bounds for the hyperparameters a priori, which is an assumption commonly found in related work. Instead, we are able to derive bounds from data in an intuitive fashion. We additionally employ the proposed technique to derive performance guarantees for a class of learning-based control problems. 
Experiments show that the bound performs significantly better than vanilla and fully Bayesian Gaussian processes.", "bibtex": "@InProceedings{pmlr-v162-capone22a,\n title = \t {{G}aussian Process Uniform Error Bounds with Unknown Hyperparameters for Safety-Critical Applications},\n author = {Capone, Alexandre and Lederer, Armin and Hirche, Sandra},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2609--2624},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/capone22a/capone22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/capone22a.html},\n abstract = \t {Gaussian processes have become a promising tool for various safety-critical settings, since the posterior variance can be used to directly estimate the model error and quantify risk. However, state-of-the-art techniques for safety-critical settings hinge on the assumption that the kernel hyperparameters are known, which does not apply in general. To mitigate this, we introduce robust Gaussian process uniform error bounds in settings with unknown hyperparameters. Our approach computes a confidence region in the space of hyperparameters, which enables us to obtain a probabilistic upper bound for the model error of a Gaussian process with arbitrary hyperparameters. We do not require to know any bounds for the hyperparameters a priori, which is an assumption commonly found in related work. Instead, we are able to derive bounds from data in an intuitive fashion. We additionally employ the proposed technique to derive performance guarantees for a class of learning-based control problems. 
Experiments show that the bound performs significantly better than vanilla and fully Bayesian Gaussian processes.}\n}", "pdf": "https://proceedings.mlr.press/v162/capone22a/capone22a.pdf", "supp": "", "pdf_size": 830541, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10619138412695371190&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "TUM School of Computation, Information and Technology, Technical University of Munich, Munich, Germany; TUM School of Computation, Information and Technology, Technical University of Munich, Munich, Germany; TUM School of Computation, Information and Technology, Technical University of Munich, Munich, Germany", "aff_domain": "tum.de; ; ", "email": "tum.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/capone22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "School of Computation, Information and Technology", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "GenLabel: Mixup Relabeling using Generative Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16521", "id": "16521", "proceeding": "https://proceedings.mlr.press/v162/sohn22a.html", "poster": "/media/PosterPDFs/ICML%202022/3621f1454cacf995530ea53652ddf8fb_PpqEx7I.png?t=1657462609.170413", "slides": "", "author_site": "Jy yong Sohn, Liang Shang, Hongxu Chen, Jaekyun Moon, Dimitris Papailiopoulos, Kangwook Lee", "author": "Jy-Yong Sohn; Liang Shang; Hongxu Chen; Jaekyun Moon; Dimitris Papailiopoulos; Kangwook Lee", "abstract": "Mixup is a data augmentation method that generates new data points by mixing a pair of input data. While mixup generally improves the prediction performance, it sometimes degrades the performance. In this paper, we first identify the main causes of this phenomenon by theoretically and empirically analyzing the mixup algorithm. To resolve this, we propose GenLabel, a simple yet effective relabeling algorithm designed for mixup. In particular, GenLabel helps the mixup algorithm correctly label mixup samples by learning the class-conditional data distribution using generative models. Via theoretical and empirical analysis, we show that mixup, when used together with GenLabel, can effectively resolve the aforementioned phenomenon, improving the accuracy of mixup-trained model.", "bibtex": "@InProceedings{pmlr-v162-sohn22a,\n title = \t {{G}en{L}abel: Mixup Relabeling using Generative Models},\n author = {Sohn, Jy-Yong and Shang, Liang and Chen, Hongxu and Moon, Jaekyun and Papailiopoulos, Dimitris and Lee, Kangwook},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20278--20313},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sohn22a/sohn22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sohn22a.html},\n abstract = \t {Mixup is a data augmentation method that generates new data points by mixing a pair of input data. 
While mixup generally improves the prediction performance, it sometimes degrades the performance. In this paper, we first identify the main causes of this phenomenon by theoretically and empirically analyzing the mixup algorithm. To resolve this, we propose GenLabel, a simple yet effective relabeling algorithm designed for mixup. In particular, GenLabel helps the mixup algorithm correctly label mixup samples by learning the class-conditional data distribution using generative models. Via theoretical and empirical analysis, we show that mixup, when used together with GenLabel, can effectively resolve the aforementioned phenomenon, improving the accuracy of mixup-trained model.}\n}", "pdf": "https://proceedings.mlr.press/v162/sohn22a/sohn22a.pdf", "supp": "", "pdf_size": 10598169, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3023782389298750158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, University of Wisconsin, Madison, USA; Department of Electrical and Computer Engineering, University of Wisconsin, Madison, USA; Department of Electrical and Computer Engineering, University of Wisconsin, Madison, USA; School of Electrical Engineering, Daejeon, KAIST; Department of Electrical and Computer Engineering, University of Wisconsin, Madison, USA; Department of Electrical and Computer Engineering, University of Wisconsin, Madison, USA", "aff_domain": "gmail.com; ; ; ; ;gmail.com", "email": "gmail.com; ; ; ; ;gmail.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/sohn22a.html", "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "University of Wisconsin-Madison;KAIST", "aff_unique_dep": "Department of Electrical and Computer Engineering;School of Electrical Engineering", "aff_unique_url": "https://www.wisc.edu;https://www.kaist.ac.kr", "aff_unique_abbr": "UW-Madison;KAIST", "aff_campus_unique_index": "0;0;0;1;0;0", "aff_campus_unique": "Madison;Daejeon", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;South Korea" }, { "title": "General-purpose, long-context autoregressive modeling with Perceiver AR", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17885", "id": "17885", "proceeding": "https://proceedings.mlr.press/v162/hawthorne22a.html", "poster": "/media/PosterPDFs/ICML%202022/0613239e122094abb4ef998c01d16958.png?t=1657574003.4825418", "slides": "", "author_site": "Curtis Hawthorne, Andrew Jaegle, C\u0103t\u0103lina Cangea, Sebastian Borgeaud, Charlie Nash, Mateusz Malinowski, Sander Dieleman, Oriol Vinyals, Matthew Botvinick, Ian Simon, Hannah Sheahan, Neil Zeghidour, Jean-Baptiste Alayrac, Joao Carreira, Jesse Engel", "author": "Curtis Hawthorne; Andrew Jaegle; C\u0103t\u0103lina Cangea; Sebastian Borgeaud; Charlie Nash; Mateusz Malinowski; Sander Dieleman; Oriol Vinyals; Matthew Botvinick; Ian Simon; Hannah Sheahan; Neil Zeghidour; Jean-Baptiste Alayrac; Joao Carreira; Jesse Engel", "abstract": "Real-world data is high-dimensional: a book, image, or musical performance can easily contain hundreds of thousands of elements even after compression. However, the most commonly used autoregressive models, Transformers, are prohibitively expensive to scale to the number of inputs and layers needed to capture this long-range structure. 
We develop Perceiver AR, an autoregressive, modality-agnostic architecture which uses cross-attention to map long-range inputs to a small number of latents while also maintaining end-to-end causal masking. Perceiver AR can directly attend to over a hundred thousand tokens, enabling practical long-context density estimation without the need for hand-crafted sparsity patterns or memory mechanisms. When trained on images or music, Perceiver AR generates outputs with clear long-term coherence and structure. Our architecture also obtains state-of-the-art likelihood on long-sequence benchmarks, including 64x64 ImageNet images and PG-19 books.", "bibtex": "@InProceedings{pmlr-v162-hawthorne22a,\n title = \t {General-purpose, long-context autoregressive modeling with Perceiver {AR}},\n author = {Hawthorne, Curtis and Jaegle, Andrew and Cangea, C{\\u{a}}t{\\u{a}}lina and Borgeaud, Sebastian and Nash, Charlie and Malinowski, Mateusz and Dieleman, Sander and Vinyals, Oriol and Botvinick, Matthew and Simon, Ian and Sheahan, Hannah and Zeghidour, Neil and Alayrac, Jean-Baptiste and Carreira, Joao and Engel, Jesse},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8535--8558},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hawthorne22a/hawthorne22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hawthorne22a.html},\n abstract = \t {Real-world data is high-dimensional: a book, image, or musical performance can easily contain hundreds of thousands of elements even after compression. However, the most commonly used autoregressive models, Transformers, are prohibitively expensive to scale to the number of inputs and layers needed to capture this long-range structure. We develop Perceiver AR, an autoregressive, modality-agnostic architecture which uses cross-attention to map long-range inputs to a small number of latents while also maintaining end-to-end causal masking. Perceiver AR can directly attend to over a hundred thousand tokens, enabling practical long-context density estimation without the need for hand-crafted sparsity patterns or memory mechanisms. When trained on images or music, Perceiver AR generates outputs with clear long-term coherence and structure. 
Our architecture also obtains state-of-the-art likelihood on long-sequence benchmarks, including 64x64 ImageNet images and PG-19 books.}\n}", "pdf": "https://proceedings.mlr.press/v162/hawthorne22a/hawthorne22a.pdf", "supp": "", "pdf_size": 3656835, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1307821423265105144&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Google Research, Brain Team+DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; Google Research, Brain Team; DeepMind; Google Research, Brain Team+DeepMind; DeepMind; DeepMind; Google Research, Brain Team+DeepMind", "aff_domain": "google.com;deepmind.com; ; ; ; ; ; ; ;google.com; ;google.com; ; ;google.com", "email": "google.com;deepmind.com; ; ; ; ; ; ; ;google.com; ;google.com; ; ;google.com", "github": "", "project": "", "author_num": 15, "oa": "https://proceedings.mlr.press/v162/hawthorne22a.html", "aff_unique_index": "0+1;1;1;1;1;1;1;1;1;0;1;0+1;1;1;0+1", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://deepmind.com", "aff_unique_abbr": "Google;DeepMind", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;1;1;1;1;1;1;1;1;0;1;0+1;1;1;0+1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Generalised Policy Improvement with Geometric Policy Composition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17511", "id": "17511", "proceeding": "https://proceedings.mlr.press/v162/thakoor22a.html", "poster": "/media/PosterPDFs/ICML%202022/1f029c1e1abaaf0605807b7f91552d36.png?t=1658244537.6480985", "slides": "", "author_site": "Shantanu Thakoor, Mark Rowland, Diana Borsa, Will Dabney, Remi Munos, Andre Barreto", "author": "Shantanu Thakoor; Mark Rowland; Diana Borsa; Will Dabney; Remi Munos; Andre Barreto", "abstract": "We introduce a method for policy improvement that interpolates between the greedy approach of value-based reinforcement learning (RL) and the full planning approach typical of model-based RL. The new method builds on the concept of a geometric horizon model (GHM, also known as a \\gamma-model), which models the discounted state-visitation distribution of a given policy. We show that we can evaluate any non-Markov policy that switches between a set of base Markov policies with fixed probability by a careful composition of the base policy GHMs, without any additional learning. We can then apply generalised policy improvement (GPI) to collections of such non-Markov policies to obtain a new Markov policy that will in general outperform its precursors. We provide a thorough theoretical analysis of this approach, develop applications to transfer and standard RL, and empirically demonstrate its effectiveness over standard GPI on a challenging deep RL continuous control task. 
We also provide an analysis of GHM training methods, proving a novel convergence result regarding previously proposed methods and showing how to train these models stably in deep RL settings.", "bibtex": "@InProceedings{pmlr-v162-thakoor22a,\n title = \t {Generalised Policy Improvement with Geometric Policy Composition},\n author = {Thakoor, Shantanu and Rowland, Mark and Borsa, Diana and Dabney, Will and Munos, Remi and Barreto, Andre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21272--21307},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/thakoor22a/thakoor22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/thakoor22a.html},\n abstract = \t {We introduce a method for policy improvement that interpolates between the greedy approach of value-based reinforcement learning (RL) and the full planning approach typical of model-based RL. The new method builds on the concept of a geometric horizon model (GHM, also known as a \\gamma-model), which models the discounted state-visitation distribution of a given policy. We show that we can evaluate any non-Markov policy that switches between a set of base Markov policies with fixed probability by a careful composition of the base policy GHMs, without any additional learning. We can then apply generalised policy improvement (GPI) to collections of such non-Markov policies to obtain a new Markov policy that will in general outperform its precursors. We provide a thorough theoretical analysis of this approach, develop applications to transfer and standard RL, and empirically demonstrate its effectiveness over standard GPI on a challenging deep RL continuous control task. 
We also provide an analysis of GHM training methods, proving a novel convergence result regarding previously proposed methods and showing how to train these models stably in deep RL settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/thakoor22a/thakoor22a.pdf", "supp": "", "pdf_size": 1915112, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3702679662928020842&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "deepmind.com;deepmind.com; ; ; ; ", "email": "deepmind.com;deepmind.com; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/thakoor22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Generalization Bounds using Lower Tail Exponents in Stochastic Optimizers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16195", "id": "16195", "proceeding": "https://proceedings.mlr.press/v162/hodgkinson22a.html", "poster": "/media/PosterPDFs/ICML%202022/812b4ba287f5ee0bc9d43bbf5bbe87fb.png?t=1657253475.909405", "slides": "/media/icml-2022/Slides/16195.pdf", "author_site": "Liam Hodgkinson, Umut Simsekli, Rajiv Khanna, Michael Mahoney", "author": "Liam Hodgkinson; Umut Simsekli; Rajiv Khanna; Michael Mahoney", "abstract": "Despite the ubiquitous use of stochastic optimization algorithms in machine learning, the precise impact of these algorithms and their dynamics on generalization performance in realistic non-convex settings is still poorly understood. While recent work has revealed connections between generalization and heavy-tailed behavior in stochastic optimization, they mainly relied on continuous-time approximations; and a rigorous treatment for the original discrete-time iterations is yet to be performed. To bridge this gap, we present novel bounds linking generalization to the lower tail exponent of the transition kernel associated with the optimizer around a local minimum, in both discrete- and continuous-time settings. To achieve this, we first prove a data- and algorithm-dependent generalization bound in terms of the celebrated Fernique-Talagrand functional applied to the trajectory of the optimizer. Then, we specialize this result by exploiting the Markovian structure of stochastic optimizers, and derive bounds in terms of their (data-dependent) transition kernels. 
We support our theory with empirical results from a variety of neural networks, showing correlations between generalization error and lower tail exponents.", "bibtex": "@InProceedings{pmlr-v162-hodgkinson22a,\n title = \t {Generalization Bounds using Lower Tail Exponents in Stochastic Optimizers},\n author = {Hodgkinson, Liam and Simsekli, Umut and Khanna, Rajiv and Mahoney, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8774--8795},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hodgkinson22a/hodgkinson22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hodgkinson22a.html},\n abstract = \t {Despite the ubiquitous use of stochastic optimization algorithms in machine learning, the precise impact of these algorithms and their dynamics on generalization performance in realistic non-convex settings is still poorly understood. While recent work has revealed connections between generalization and heavy-tailed behavior in stochastic optimization, they mainly relied on continuous-time approximations; and a rigorous treatment for the original discrete-time iterations is yet to be performed. To bridge this gap, we present novel bounds linking generalization to the lower tail exponent of the transition kernel associated with the optimizer around a local minimum, in both discrete- and continuous-time settings. To achieve this, we first prove a data- and algorithm-dependent generalization bound in terms of the celebrated Fernique-Talagrand functional applied to the trajectory of the optimizer. Then, we specialize this result by exploiting the Markovian structure of stochastic optimizers, and derive bounds in terms of their (data-dependent) transition kernels. 
We support our theory with empirical results from a variety of neural networks, showing correlations between generalization error and lower tail exponents.}\n}", "pdf": "https://proceedings.mlr.press/v162/hodgkinson22a/hodgkinson22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/hodgkinson22a-supp.zip", "pdf_size": 1209573, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11215041435496699178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "ICSI and Department of Statistics, University of California, Berkeley, USA; INRIA \u2014 D\u00e9partement d\u2019Informatique de l\u2019\u00c9cole Normale Sup\u00e9rieure, PSL Research University, Paris, France; Department of Computer Science, Purdue University, Indiana, USA; ICSI and Department of Statistics, University of California, Berkeley, USA", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hodgkinson22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, Berkeley;INRIA;Purdue University", "aff_unique_dep": "Department of Statistics;D\u00e9partement d\u2019Informatique de l\u2019\u00c9cole Normale Sup\u00e9rieure;Department of Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.inria.fr;https://www.purdue.edu", "aff_unique_abbr": "UC Berkeley;INRIA;Purdue", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Berkeley;Paris;Indiana", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;France" }, { "title": "Generalization Guarantee of Training Graph Convolutional Networks with Graph Topology Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16763", "id": "16763", "proceeding": "https://proceedings.mlr.press/v162/li22u.html", "poster": "/media/PosterPDFs/ICML%202022/c9dd73f5cb96486f5e1e0680e841a550_izrWtMG.png?t=1657217075.3109508", "slides": "", "author_site": "Hongkang Li, Meng Wang, Sijia Liu, Pin-Yu Chen, Jinjun Xiong", "author": "Hongkang Li; Meng Wang; Sijia Liu; Pin-Yu Chen; Jinjun Xiong", "abstract": "Graph convolutional networks (GCNs) have recently achieved great empirical success in learning graph-structured data. To address its scalability issue due to the recursive embedding of neighboring features, graph topology sampling has been proposed to reduce the memory and computational cost of training GCNs, and it has achieved comparable test performance to those without topology sampling in many empirical studies. To the best of our knowledge, this paper provides the first theoretical justification of graph topology sampling in training (up to) three-layer GCNs for semi-supervised node classification. We formally characterize some sufficient conditions on graph topology sampling such that GCN training leads to diminishing generalization error. Moreover, our method tackles the non-convex interaction of weights across layers, which is under-explored in the existing theoretical analyses of GCNs. 
This paper characterizes the impact of graph structures and topology sampling on the generalization performance and sample complexity explicitly, and the theoretical findings are also justified through numerical experiments.", "bibtex": "@InProceedings{pmlr-v162-li22u,\n title = \t {Generalization Guarantee of Training Graph Convolutional Networks with Graph Topology Sampling},\n author = {Li, Hongkang and Wang, Meng and Liu, Sijia and Chen, Pin-Yu and Xiong, Jinjun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13014--13051},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22u/li22u.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22u.html},\n abstract = \t {Graph convolutional networks (GCNs) have recently achieved great empirical success in learning graph-structured data. To address its scalability issue due to the recursive embedding of neighboring features, graph topology sampling has been proposed to reduce the memory and computational cost of training GCNs, and it has achieved comparable test performance to those without topology sampling in many empirical studies. To the best of our knowledge, this paper provides the first theoretical justification of graph topology sampling in training (up to) three-layer GCNs for semi-supervised node classification. We formally characterize some sufficient conditions on graph topology sampling such that GCN training leads to diminishing generalization error. Moreover, our method tackles the non-convex interaction of weights across layers, which is under-explored in the existing theoretical analyses of GCNs. This paper characterizes the impact of graph structures and topology sampling on the generalization performance and sample complexity explicitly, and the theoretical findings are also justified through numerical experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22u/li22u.pdf", "supp": "", "pdf_size": 1434128, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5364219507284501997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Electrical, Computer, and System Engineering, Rensselaer Polytechnic Institute, NY, USA; Department of Electrical, Computer, and System Engineering, Rensselaer Polytechnic Institute, NY, USA; Department of Computer Science and Engineering, Michigan State University, MI, USA+MIT-IBM Watson AI Lab, IBM Research, MA, USA; IBM Thomas J. 
Watson Research Center, Yorktown Heights, NY, USA; Department of Computer Science and Engineering, University at Buffalo, NY, USA", "aff_domain": "rpi.edu;rpi.edu;msu.edu;ibm.com;buffalo.edu", "email": "rpi.edu;rpi.edu;msu.edu;ibm.com;buffalo.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/li22u.html", "aff_unique_index": "0;0;1+2;2;3", "aff_unique_norm": "Rensselaer Polytechnic Institute;Michigan State University;IBM;University at Buffalo", "aff_unique_dep": "Department of Electrical, Computer, and System Engineering;Department of Computer Science and Engineering;AI Lab;Department of Computer Science and Engineering", "aff_unique_url": "https://www.rpi.edu;https://www.msu.edu;;https://www.buffalo.edu", "aff_unique_abbr": "RPI;MSU;MIT-IBM AI Lab;UB", "aff_campus_unique_index": "0;0;1;3;4", "aff_campus_unique": "NY;East Lansing;;Yorktown Heights;Buffalo", "aff_country_unique_index": "0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Generalization and Robustness Implications in Object-Centric Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18259", "id": "18259", "proceeding": "https://proceedings.mlr.press/v162/dittadi22a.html", "poster": "", "slides": "", "author_site": "Andrea Dittadi, Samuele Papa, Michele De Vita, Bernhard Sch\u00f6lkopf, Ole Winther, Francesco Locatello", "author": "Andrea Dittadi; Samuele S Papa; Michele De Vita; Bernhard Sch\u00f6lkopf; Ole Winther; Francesco Locatello", "abstract": "The idea behind object-centric representation learning is that natural scenes can better be modeled as compositions of objects and their relations as opposed to distributed representations. This inductive bias can be injected into neural networks to potentially improve systematic generalization and performance of downstream tasks in scenes with multiple objects. In this paper, we train state-of-the-art unsupervised models on five common multi-object datasets and evaluate segmentation metrics and downstream object property prediction. In addition, we study generalization and robustness by investigating the settings where either a single object is out of distribution \u2013 e.g., having an unseen color, texture, or shape \u2013 or global properties of the scene are altered \u2013 e.g., by occlusions, cropping, or increasing the number of objects. From our experimental study, we find object-centric representations to be useful for downstream tasks and generally robust to most distribution shifts affecting objects. 
However, when the distribution shift affects the input in a less structured manner, robustness in terms of segmentation and downstream task performance may vary significantly across models and distribution shifts.", "bibtex": "@InProceedings{pmlr-v162-dittadi22a,\n title = \t {Generalization and Robustness Implications in Object-Centric Learning},\n author = {Dittadi, Andrea and Papa, Samuele S and De Vita, Michele and Sch{\\\"o}lkopf, Bernhard and Winther, Ole and Locatello, Francesco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5221--5285},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dittadi22a/dittadi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dittadi22a.html},\n abstract = \t {The idea behind object-centric representation learning is that natural scenes can better be modeled as compositions of objects and their relations as opposed to distributed representations. This inductive bias can be injected into neural networks to potentially improve systematic generalization and performance of downstream tasks in scenes with multiple objects. In this paper, we train state-of-the-art unsupervised models on five common multi-object datasets and evaluate segmentation metrics and downstream object property prediction. In addition, we study generalization and robustness by investigating the settings where either a single object is out of distribution \u2013 e.g., having an unseen color, texture, or shape \u2013 or global properties of the scene are altered \u2013 e.g., by occlusions, cropping, or increasing the number of objects. From our experimental study, we find object-centric representations to be useful for downstream tasks and generally robust to most distribution shifts affecting objects. 
However, when the distribution shift affects the input in a less structured manner, robustness in terms of segmentation and downstream task performance may vary significantly across models and distribution shifts.}\n}", "pdf": "https://proceedings.mlr.press/v162/dittadi22a/dittadi22a.pdf", "supp": "", "pdf_size": 16900848, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9362373326387424526&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Technical University of Denmark+Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; Technical University of Denmark; Technical University of Denmark; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+University of Copenhagen+Rigshospitalet, Copenhagen University Hospital; University of Copenhagen+Rigshospitalet, Copenhagen University Hospital; Amazon", "aff_domain": "dtu.dk; ; ; ; ; ", "email": "dtu.dk; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/dittadi22a.html", "aff_unique_index": "0+1;0;0;1+2+3;2+3;4", "aff_unique_norm": "Technical University of Denmark;Max Planck Institute for Intelligent Systems;University of Copenhagen;Copenhagen University Hospital;Amazon", "aff_unique_dep": ";;;;Amazon.com, Inc.", "aff_unique_url": "https://www.tek.dk;https://www.mpi-is.mpg.de;https://www.ku.dk;https://www.rigshospitalet.dk;https://www.amazon.com", "aff_unique_abbr": "DTU;MPI-IS;UCPH;;Amazon", "aff_campus_unique_index": "1;1+2;2", "aff_campus_unique": ";T\u00fcbingen;Copenhagen", "aff_country_unique_index": "0+1;0;0;1+0+0;0+0;2", "aff_country_unique": "Denmark;Germany;United States" }, { "title": "Generalized Beliefs for Cooperative AI", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17845", "id": "17845", "proceeding": "https://proceedings.mlr.press/v162/muglich22a.html", "poster": "/media/PosterPDFs/ICML%202022/5a9542c773018268fc6271f7afeea969.png?t=1657824029.7422264", "slides": "", "author_site": "Darius Muglich, Luisa Zintgraf, Christian Schroeder de Witt, Shimon Whiteson, Jakob Foerster", "author": "Darius Muglich; Luisa M Zintgraf; Christian A Schroeder De Witt; Shimon Whiteson; Jakob Foerster", "abstract": "Self-play is a common method for constructing solutions in Markov games that can yield optimal policies in collaborative settings. However, these policies often adopt highly-specialized conventions that make playing with a novel partner difficult. To address this, recent approaches rely on encoding symmetry and convention-awareness into policy training, but these require strong environmental assumptions and can complicate policy training. To overcome this, we propose moving the learning of conventions to the belief space. Specifically, we propose a belief learning paradigm that can maintain beliefs over rollouts of policies not seen at training time, and can thus decode and adapt to novel conventions at test time. We show how to leverage this belief model for both search and training of a best response over a pool of policies to greatly improve zero-shot coordination. 
We also show how our paradigm promotes explainability and interpretability of nuanced agent conventions.", "bibtex": "@InProceedings{pmlr-v162-muglich22a,\n title = \t {Generalized Beliefs for Cooperative {AI}},\n author = {Muglich, Darius and Zintgraf, Luisa M and De Witt, Christian A Schroeder and Whiteson, Shimon and Foerster, Jakob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16062--16082},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/muglich22a/muglich22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/muglich22a.html},\n abstract = \t {Self-play is a common method for constructing solutions in Markov games that can yield optimal policies in collaborative settings. However, these policies often adopt highly-specialized conventions that make playing with a novel partner difficult. To address this, recent approaches rely on encoding symmetry and convention-awareness into policy training, but these require strong environmental assumptions and can complicate policy training. To overcome this, we propose moving the learning of conventions to the belief space. Specifically, we propose a belief learning paradigm that can maintain beliefs over rollouts of policies not seen at training time, and can thus decode and adapt to novel conventions at test time. We show how to leverage this belief model for both search and training of a best response over a pool of policies to greatly improve zero-shot coordination. We also show how our paradigm promotes explainability and interpretability of nuanced agent conventions.}\n}", "pdf": "https://proceedings.mlr.press/v162/muglich22a/muglich22a.pdf", "supp": "", "pdf_size": 4192647, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5373823282738718621&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Oxford; University of Oxford; University of Oxford; University of Oxford; University of Oxford", "aff_domain": "yahoo.com; ; ; ; ", "email": "yahoo.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/muglich22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Generalized Data Distribution Iteration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17793", "id": "17793", "proceeding": "https://proceedings.mlr.press/v162/fan22c.html", "poster": "/media/PosterPDFs/ICML%202022/1091660f3dff84fd648efe31391c5524_S5scOTH.png?t=1654591403.6659465", "slides": "/media/icml-2022/Slides/17793_14uK7ZN.pdf", "author_site": "Jiajun Fan, Changnan Xiao", "author": "Jiajun Fan; Changnan Xiao", "abstract": "To obtain higher sample efficiency and superior final performance simultaneously has been one of the major challenges for deep reinforcement learning (DRL). Previous work could handle one of these challenges but typically failed to address them concurrently. In this paper, we try to tackle these two challenges simultaneously. 
To achieve this, we first decouple these challenges into two classic RL problems: data richness and exploration-exploitation trade-off. Then, we cast these two problems into the training data distribution optimization problem, namely to obtain desired training data within limited interactions, and address them concurrently via i) explicit modeling and control of the capacity and diversity of behavior policy and ii) more fine-grained and adaptive control of selective/sampling distribution of the behavior policy using a monotonic data distribution optimization. Finally, we integrate this process into Generalized Policy Iteration (GPI) and obtain a more general framework called Generalized Data Distribution Iteration (GDI). We use the GDI framework to introduce operator-based versions of well-known RL methods from DQN to Agent57. We provide a theoretical guarantee of the superiority of GDI compared with GPI. We also demonstrate our state-of-the-art (SOTA) performance on the Arcade Learning Environment (ALE), wherein our algorithm has achieved 9620.98% mean human normalized score (HNS), 1146.39% median HNS, and surpassed 22 human world records using only 200M training frames. Our performance is comparable to Agent57\u2019s while we consume 500 times less data. We argue that there is still a long way to go before obtaining real superhuman agents in ALE.", "bibtex": "@InProceedings{pmlr-v162-fan22c,\n title = \t {Generalized Data Distribution Iteration},\n author = {Fan, Jiajun and Xiao, Changnan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6103--6184},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fan22c/fan22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/fan22c.html},\n abstract = \t {To obtain higher sample efficiency and superior final performance simultaneously has been one of the major challenges for deep reinforcement learning (DRL). Previous work could handle one of these challenges but typically failed to address them concurrently. In this paper, we try to tackle these two challenges simultaneously. To achieve this, we first decouple these challenges into two classic RL problems: data richness and exploration-exploitation trade-off. Then, we cast these two problems into the training data distribution optimization problem, namely to obtain desired training data within limited interactions, and address them concurrently via i) explicit modeling and control of the capacity and diversity of behavior policy and ii) more fine-grained and adaptive control of selective/sampling distribution of the behavior policy using a monotonic data distribution optimization. Finally, we integrate this process into Generalized Policy Iteration (GPI) and obtain a more general framework called Generalized Data Distribution Iteration (GDI). We use the GDI framework to introduce operator-based versions of well-known RL methods from DQN to Agent57. We provide a theoretical guarantee of the superiority of GDI compared with GPI. 
We also demonstrate our state-of-the-art (SOTA) performance on the Arcade Learning Environment (ALE), wherein our algorithm has achieved 9620.98% mean human normalized score (HNS), 1146.39% median HNS, and surpassed 22 human world records using only 200M training frames. Our performance is comparable to Agent57\u2019s while we consume 500 times less data. We argue that there is still a long way to go before obtaining real superhuman agents in ALE.}\n}", "pdf": "https://proceedings.mlr.press/v162/fan22c/fan22c.pdf", "supp": "", "pdf_size": 10718207, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4459240977293833974&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Tsinghua Shenzhen International Graduate School, Tsinghua University, Beijing, China; ByteDance, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn;bytedance.com", "email": "mails.tsinghua.edu.cn;bytedance.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/fan22c.html", "aff_unique_index": "0;1", "aff_unique_norm": "Tsinghua University;ByteDance", "aff_unique_dep": "International Graduate School;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.bytedance.com", "aff_unique_abbr": "THU;ByteDance", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Shenzhen;Beijing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Generalized Federated Learning via Sharpness Aware Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16133", "id": "16133", "proceeding": "https://proceedings.mlr.press/v162/qu22a.html", "poster": "/media/PosterPDFs/ICML%202022/70afbf2259b4449d8ae1429e054df1b1.png?t=1657415808.4266086", "slides": "", "author_site": "Zhe Qu, Xingyu Li, Rui Duan, Yao Liu, Bo Tang, Zhuo Lu", "author": "Zhe Qu; Xingyu Li; Rui Duan; Yao Liu; Bo Tang; Zhuo Lu", "abstract": "Federated Learning (FL) is a promising framework for performing privacy-preserving, distributed learning with a set of clients. However, the data distribution among clients is often non-IID, i.e., it exhibits distribution shift, which makes efficient optimization difficult. To tackle this problem, many FL algorithms focus on mitigating the effects of data heterogeneity across clients by increasing the performance of the global model. However, almost all algorithms use Empirical Risk Minimization (ERM) as the local optimizer, which can easily make the global model fall into a sharp valley and induce large deviations on parts of the local clients. Therefore, in this paper, we revisit the solutions to the distribution shift problem in FL with a focus on local learning generality. To this end, we propose a general, effective algorithm, \\texttt{FedSAM}, based on a Sharpness Aware Minimization (SAM) local optimizer, and develop a momentum FL algorithm to bridge local and global models, \\texttt{MoFedSAM}. Theoretically, we provide a convergence analysis of these two algorithms and establish a generalization bound for \\texttt{FedSAM}. 
Empirically, our proposed algorithms substantially outperform existing FL studies and significantly decrease the learning deviation.", "bibtex": "@InProceedings{pmlr-v162-qu22a,\n title = \t {Generalized Federated Learning via Sharpness Aware Minimization},\n author = {Qu, Zhe and Li, Xingyu and Duan, Rui and Liu, Yao and Tang, Bo and Lu, Zhuo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18250--18280},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qu22a/qu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qu22a.html},\n abstract = \t {Federated Learning (FL) is a promising framework for performing privacy-preserving, distributed learning with a set of clients. However, the data distribution among clients is often non-IID, i.e., it exhibits distribution shift, which makes efficient optimization difficult. To tackle this problem, many FL algorithms focus on mitigating the effects of data heterogeneity across clients by increasing the performance of the global model. However, almost all algorithms use Empirical Risk Minimization (ERM) as the local optimizer, which can easily make the global model fall into a sharp valley and induce large deviations on parts of the local clients. Therefore, in this paper, we revisit the solutions to the distribution shift problem in FL with a focus on local learning generality. To this end, we propose a general, effective algorithm, \\texttt{FedSAM}, based on a Sharpness Aware Minimization (SAM) local optimizer, and develop a momentum FL algorithm to bridge local and global models, \\texttt{MoFedSAM}. Theoretically, we provide a convergence analysis of these two algorithms and establish a generalization bound for \\texttt{FedSAM}. Empirically, our proposed algorithms substantially outperform existing FL studies and significantly decrease the learning deviation.}\n}", "pdf": "https://proceedings.mlr.press/v162/qu22a/qu22a.pdf", "supp": "", "pdf_size": 972815, "gs_citation": 182, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16424243866560583880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/qu22a.html" }, { "title": "Generalized Leverage Scores: Geometric Interpretation and Applications", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17977", "id": "17977", "proceeding": "https://proceedings.mlr.press/v162/ordozgoiti22a.html", "poster": "", "slides": "", "author_site": "Bruno Ordozgoiti, Antonis Matakos, Aristides Gionis", "author": "Bruno Ordozgoiti; Antonis Matakos; Aristides Gionis", "abstract": "In problems involving matrix computations, the concept of leverage has found a large number of applications. In particular, leverage scores, which relate the columns of a matrix to the subspaces spanned by its leading singular vectors, are helpful in revealing column subsets to approximately factorize a matrix with quality guarantees. As such, they provide a solid foundation for a variety of machine-learning methods. 
In this paper we extend the definition of leverage scores to relate the columns of a matrix to arbitrary subsets of singular vectors. We establish a precise connection between column and singular-vector subsets, by relating the concepts of leverage scores and principal angles between subspaces. We employ this result to design approximation algorithms with provable guarantees for two well-known problems: generalized column subset selection and sparse canonical correlation analysis. We run numerical experiments to provide further insight on the proposed methods. The novel bounds we derive improve our understanding of fundamental concepts in matrix approximations. In addition, our insights may serve as building blocks for further contributions.", "bibtex": "@InProceedings{pmlr-v162-ordozgoiti22a,\n title = \t {Generalized Leverage Scores: Geometric Interpretation and Applications},\n author = {Ordozgoiti, Bruno and Matakos, Antonis and Gionis, Aristides},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17056--17070},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ordozgoiti22a/ordozgoiti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ordozgoiti22a.html},\n abstract = \t {In problems involving matrix computations, the concept of leverage has found a large number of applications. In particular, leverage scores, which relate the columns of a matrix to the subspaces spanned by its leading singular vectors, are helpful in revealing column subsets to approximately factorize a matrix with quality guarantees. As such, they provide a solid foundation for a variety of machine-learning methods. In this paper we extend the definition of leverage scores to relate the columns of a matrix to arbitrary subsets of singular vectors. We establish a precise connection between column and singular-vector subsets, by relating the concepts of leverage scores and principal angles between subspaces. We employ this result to design approximation algorithms with provable guarantees for two well-known problems: generalized column subset selection and sparse canonical correlation analysis. We run numerical experiments to provide further insight on the proposed methods. The novel bounds we derive improve our understanding of fundamental concepts in matrix approximations. 
In addition, our insights may serve as building blocks for further contributions.}\n}", "pdf": "https://proceedings.mlr.press/v162/ordozgoiti22a/ordozgoiti22a.pdf", "supp": "", "pdf_size": 2400501, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5219217794451144650&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Electronic Engineering and Computer Science, Queen Mary University of London, United Kingdom; Department of Computer Science, Aalto University, Finland; Division of Theoretical Computer Science, KTH Royal Institute of Technology, Sweden", "aff_domain": "qmul.ac.uk; ; ", "email": "qmul.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ordozgoiti22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Queen Mary University of London;Aalto University;KTH Royal Institute of Technology", "aff_unique_dep": "School of Electronic Engineering and Computer Science;Department of Computer Science;Division of Theoretical Computer Science", "aff_unique_url": "https://www.qmul.ac.uk;https://www.aalto.fi;https://www.kth.se", "aff_unique_abbr": "QMUL;Aalto;KTH", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United Kingdom;Finland;Sweden" }, { "title": "Generalized Results for the Existence and Consistency of the MLE in the Bradley-Terry-Luce Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16301", "id": "16301", "proceeding": "https://proceedings.mlr.press/v162/bong22a.html", "poster": "", "slides": "", "author_site": "Heejong Bong, Alessandro Rinaldo", "author": "Heejong Bong; Alessandro Rinaldo", "abstract": "Ranking problems based on pairwise comparisons, such as those arising in online gaming, often involve a large pool of items to order. In these situations, the gap in performance between any two items can be significant, and the smallest and largest winning probabilities can be very close to zero or one. Furthermore, each item may be compared only to a subset of all the items, so that not all pairwise comparisons are observed. In this paper, we study the performance of the Bradley-Terry-Luce model for ranking from pairwise comparison data under more realistic settings than those considered in the literature so far. In particular, we allow for near-degenerate winning probabilities and arbitrary comparison designs. We obtain novel results about the existence of the maximum likelihood estimator (MLE) and the corresponding $\\ell_2$ estimation error without the bounded winning probability assumption commonly used in the literature and for arbitrary comparison graph topologies. Central to our approach is the reliance on the Fisher information matrix to express the dependence on the graph topologies and the impact of the values of the winning probabilities on the estimation risk and on the conditions for the existence of the MLE. 
Our bounds recover existing results as special cases but are more broadly applicable.", "bibtex": "@InProceedings{pmlr-v162-bong22a,\n title = \t {Generalized Results for the Existence and Consistency of the {MLE} in the Bradley-Terry-Luce Model},\n author = {Bong, Heejong and Rinaldo, Alessandro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2160--2177},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bong22a/bong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bong22a.html},\n abstract = \t {Ranking problems based on pairwise comparisons, such as those arising in online gaming, often involve a large pool of items to order. In these situations, the gap in performance between any two items can be significant, and the smallest and largest winning probabilities can be very close to zero or one. Furthermore, each item may be compared only to a subset of all the items, so that not all pairwise comparisons are observed. In this paper, we study the performance of the Bradley-Terry-Luce model for ranking from pairwise comparison data under more realistic settings than those considered in the literature so far. In particular, we allow for near-degenerate winning probabilities and arbitrary comparison designs. We obtain novel results about the existence of the maximum likelihood estimator (MLE) and the corresponding $\\ell_2$ estimation error without the bounded winning probability assumption commonly used in the literature and for arbitrary comparison graph topologies. Central to our approach is the reliance on the Fisher information matrix to express the dependence on the graph topologies and the impact of the values of the winning probabilities on the estimation risk and on the conditions for the existence of the MLE. 
Our bounds recover existing results as special cases but are more broadly applicable.}\n}", "pdf": "https://proceedings.mlr.press/v162/bong22a/bong22a.pdf", "supp": "", "pdf_size": 645798, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2724369164015341897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Statistics and Data Sciences, Carnegie Mellon University, Pittsburgh, PA, USA; Department of Statistics and Data Sciences, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "andrew.cmu.edu;cmu.edu", "email": "andrew.cmu.edu;cmu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/bong22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Department of Statistics and Data Sciences", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generalized Strategic Classification and the Case of Aligned Incentives", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16371", "id": "16371", "proceeding": "https://proceedings.mlr.press/v162/levanon22a.html", "poster": "/media/PosterPDFs/ICML%202022/289dff07669d7a23de0ef88d2f7129e7.png?t=1657709147.5271113", "slides": "", "author_site": "Sagi Levanon, Nir Rosenfeld", "author": "Sagi Levanon; Nir Rosenfeld", "abstract": "Strategic classification studies learning in settings where self-interested users can strategically modify their features to obtain favorable predictive outcomes. A key working assumption, however, is that \u201cfavorable\u201d always means \u201cpositive\u201d; this may be appropriate in some applications (e.g., loan approval), but reduces to a fairly narrow view of what user interests can be. In this work we argue for a broader perspective on what accounts for strategic user behavior, and propose and study a flexible model of generalized strategic classification. Our generalized model subsumes most current models but includes other novel settings; among these, we identify and target one intriguing sub-class of problems in which the interests of users and the system are aligned. This setting reveals a surprising fact: that standard max-margin losses are ill-suited for strategic inputs. Returning to our fully generalized model, we propose a novel max-margin framework for strategic learning that is practical and effective, and which we analyze theoretically. 
We conclude with a set of experiments that empirically demonstrate the utility of our approach.", "bibtex": "@InProceedings{pmlr-v162-levanon22a,\n title = \t {Generalized Strategic Classification and the Case of Aligned Incentives},\n author = {Levanon, Sagi and Rosenfeld, Nir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12593--12618},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/levanon22a/levanon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/levanon22a.html},\n abstract = \t {Strategic classification studies learning in settings where self-interested users can strategically modify their features to obtain favorable predictive outcomes. A key working assumption, however, is that \u201cfavorable\u201d always means \u201cpositive\u201d; this may be appropriate in some applications (e.g., loan approval), but reduces to a fairly narrow view of what user interests can be. In this work we argue for a broader perspective on what accounts for strategic user behavior, and propose and study a flexible model of generalized strategic classification. Our generalized model subsumes most current models but includes other novel settings; among these, we identify and target one intriguing sub-class of problems in which the interests of users and the system are aligned. This setting reveals a surprising fact: that standard max-margin losses are ill-suited for strategic inputs. Returning to our fully generalized model, we propose a novel max-margin framework for strategic learning that is practical and effective, and which we analyze theoretically. 
We conclude with a set of experiments that empirically demonstrate the utility of our approach.}\n}", "pdf": "https://proceedings.mlr.press/v162/levanon22a/levanon22a.pdf", "supp": "", "pdf_size": 871921, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5634368728411242394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Faculty of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel; Faculty of Computer Science, Technion - Israel Institute of Technology, Haifa, Israel", "aff_domain": "cs.technion.ac.il;cs.technion.ac.il", "email": "cs.technion.ac.il;cs.technion.ac.il", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/levanon22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "Faculty of Computer Science", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Generalizing Gaussian Smoothing for Random Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16433", "id": "16433", "proceeding": "https://proceedings.mlr.press/v162/gao22f.html", "poster": "/media/PosterPDFs/ICML%202022/a588a6199feff5ba48402883d9b72700.png?t=1657822858.8628252", "slides": "", "author_site": "Katelyn Gao, Ozan Sener", "author": "Katelyn Gao; Ozan Sener", "abstract": "Gaussian smoothing (GS) is a derivative-free optimization (DFO) algorithm that estimates the gradient of an objective using perturbations of the current parameters sampled from a standard normal distribution. We generalize it to sampling perturbations from a larger family of distributions. Based on an analysis of DFO for non-convex functions, we propose to choose a distribution for perturbations that minimizes the mean squared error (MSE) of the gradient estimate. We derive three such distributions with provably smaller MSE than Gaussian smoothing. We conduct evaluations of the three sampling distributions on linear regression, reinforcement learning, and DFO benchmarks in order to validate our claims. Our proposal improves on GS with the same computational complexity, and are competitive with and usually outperform Guided ES and Orthogonal ES, two computationally more expensive algorithms that adapt the covariance matrix of normally distributed perturbations.", "bibtex": "@InProceedings{pmlr-v162-gao22f,\n title = \t {Generalizing {G}aussian Smoothing for Random Search},\n author = {Gao, Katelyn and Sener, Ozan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7077--7101},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22f/gao22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22f.html},\n abstract = \t {Gaussian smoothing (GS) is a derivative-free optimization (DFO) algorithm that estimates the gradient of an objective using perturbations of the current parameters sampled from a standard normal distribution. We generalize it to sampling perturbations from a larger family of distributions. 
Based on an analysis of DFO for non-convex functions, we propose to choose a distribution for perturbations that minimizes the mean squared error (MSE) of the gradient estimate. We derive three such distributions with provably smaller MSE than Gaussian smoothing. We conduct evaluations of the three sampling distributions on linear regression, reinforcement learning, and DFO benchmarks in order to validate our claims. Our proposal improves on GS with the same computational complexity, and are competitive with and usually outperform Guided ES and Orthogonal ES, two computationally more expensive algorithms that adapt the covariance matrix of normally distributed perturbations.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22f/gao22f.pdf", "supp": "", "pdf_size": 3360836, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2545306041243695019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Intel Labs, Santa Clara, CA, USA; Intel Labs, Munich, Germany", "aff_domain": "intel.com; ", "email": "intel.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/gao22f.html", "aff_unique_index": "0;0", "aff_unique_norm": "Intel", "aff_unique_dep": "Intel Labs", "aff_unique_url": "https://www.intel.com/research", "aff_unique_abbr": "Intel", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Santa Clara;Munich", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Germany" }, { "title": "Generalizing to Evolving Domains with Latent Structure-Aware Sequential Autoencoder", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18055", "id": "18055", "proceeding": "https://proceedings.mlr.press/v162/qin22a.html", "poster": "/media/PosterPDFs/ICML%202022/f1daf122cde863010844459363cd31db.png?t=1656680414.4127748", "slides": "/media/icml-2022/Slides/18055.pdf", "author_site": "Tiexin QIN, Shiqi Wang, Haoliang Li", "author": "Tiexin Qin; Shiqi Wang; Haoliang Li", "abstract": "Domain generalization aims to improve the generalization capability of machine learning systems to out-of-distribution (OOD) data. Existing domain generalization techniques embark upon stationary and discrete environments to tackle the generalization issue caused by OOD data. However, many real-world tasks in non-stationary environments (e.g., self-driven car system, sensor measures) involve more complex and continuously evolving domain drift, which raises new challenges for the problem of domain generalization. In this paper, we formulate the aforementioned setting as the problem of evolving domain generalization. Specifically, we propose to introduce a probabilistic framework called Latent Structure-aware Sequential Autoencoder (LSSAE) to tackle the problem of evolving domain generalization via exploring the underlying continuous structure in the latent space of deep neural networks, where we aim to identify two major factors namely covariate shift and concept shift accounting for distribution shift in non-stationary environments. 
Experimental results on both synthetic and real-world datasets show that LSSAE can lead to superior performances based on the evolving domain generalization setting.", "bibtex": "@InProceedings{pmlr-v162-qin22a,\n title = \t {Generalizing to Evolving Domains with Latent Structure-Aware Sequential Autoencoder},\n author = {Qin, Tiexin and Wang, Shiqi and Li, Haoliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18062--18082},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qin22a/qin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qin22a.html},\n abstract = \t {Domain generalization aims to improve the generalization capability of machine learning systems to out-of-distribution (OOD) data. Existing domain generalization techniques embark upon stationary and discrete environments to tackle the generalization issue caused by OOD data. However, many real-world tasks in non-stationary environments (e.g., self-driven car system, sensor measures) involve more complex and continuously evolving domain drift, which raises new challenges for the problem of domain generalization. In this paper, we formulate the aforementioned setting as the problem of evolving domain generalization. Specifically, we propose to introduce a probabilistic framework called Latent Structure-aware Sequential Autoencoder (LSSAE) to tackle the problem of evolving domain generalization via exploring the underlying continuous structure in the latent space of deep neural networks, where we aim to identify two major factors namely covariate shift and concept shift accounting for distribution shift in non-stationary environments. 
Experimental results on both synthetic and real-world datasets show that LSSAE can lead to superior performances based on the evolving domain generalization setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/qin22a/qin22a.pdf", "supp": "", "pdf_size": 3235130, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8021731201291301386&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "City University of Hong Kong; City University of Hong Kong; City University of Hong Kong", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/qin22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Generalizing to New Physical Systems via Context-Informed Dynamics Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18049", "id": "18049", "proceeding": "https://proceedings.mlr.press/v162/kirchmeyer22a.html", "poster": "/media/PosterPDFs/ICML%202022/4669d6db6d5b6739b9194e999d907924_TAePVnL.png?t=1656581626.7829597", "slides": "/media/icml-2022/Slides/18049_HKq7MJo.pdf", "author_site": "Matthieu Kirchmeyer, Yuan Yin, J\u00e9r\u00e9mie DONA, Nicolas Baskiotis, alain rakotomamonjy, Patrick Gallinari", "author": "Matthieu Kirchmeyer; Yuan Yin; Jeremie Dona; Nicolas Baskiotis; Alain Rakotomamonjy; Patrick Gallinari", "abstract": "Data-driven approaches to modeling physical systems fail to generalize to unseen systems that share the same general dynamics with the learning domain, but correspond to different physical contexts. We propose a new framework for this key problem, context-informed dynamics adaptation (CoDA), which takes into account the distributional shift across systems for fast and efficient adaptation to new dynamics. CoDA leverages multiple environments, each associated to a different dynamic, and learns to condition the dynamics model on contextual parameters, specific to each environment. The conditioning is performed via a hypernetwork, learned jointly with a context vector from observed data. The proposed formulation constrains the search hypothesis space for fast adaptation and better generalization across environments with few samples. We theoretically motivate our approach and show state-of-the-art generalization results on a set of nonlinear dynamics, representative of a variety of application domains. 
We also show, on these systems, that new system parameters can be inferred from context vectors with minimal supervision.", "bibtex": "@InProceedings{pmlr-v162-kirchmeyer22a,\n title = \t {Generalizing to New Physical Systems via Context-Informed Dynamics Model},\n author = {Kirchmeyer, Matthieu and Yin, Yuan and Dona, Jeremie and Baskiotis, Nicolas and Rakotomamonjy, Alain and Gallinari, Patrick},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11283--11301},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kirchmeyer22a/kirchmeyer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kirchmeyer22a.html},\n abstract = \t {Data-driven approaches to modeling physical systems fail to generalize to unseen systems that share the same general dynamics with the learning domain, but correspond to different physical contexts. We propose a new framework for this key problem, context-informed dynamics adaptation (CoDA), which takes into account the distributional shift across systems for fast and efficient adaptation to new dynamics. CoDA leverages multiple environments, each associated to a different dynamic, and learns to condition the dynamics model on contextual parameters, specific to each environment. The conditioning is performed via a hypernetwork, learned jointly with a context vector from observed data. The proposed formulation constrains the search hypothesis space for fast adaptation and better generalization across environments with few samples. We theoretically motivate our approach and show state-of-the-art generalization results on a set of nonlinear dynamics, representative of a variety of application domains. 
We also show, on these systems, that new system parameters can be inferred from context vectors with minimal supervision.}\n}", "pdf": "https://proceedings.mlr.press/v162/kirchmeyer22a/kirchmeyer22a.pdf", "supp": "", "pdf_size": 2426468, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9987364402754968813&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "CNRS-ISIR, Sorbonne University, Paris, France+Criteo AI Lab, Paris, France; CNRS-ISIR, Sorbonne University, Paris, France+Criteo AI Lab, Paris, France; CNRS-ISIR, Sorbonne University, Paris, France; CNRS-ISIR, Sorbonne University, Paris, France; Criteo AI Lab, Paris, France+Universit\u00e9 de Rouen, LITIS, France; CNRS-ISIR, Sorbonne University, Paris, France+Criteo AI Lab, Paris, France", "aff_domain": "sorbonne-universite.fr;sorbonne-universite.fr; ; ; ; ", "email": "sorbonne-universite.fr;sorbonne-universite.fr; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/kirchmeyer22a.html", "aff_unique_index": "0+1;0+1;0;0;1+2;0+1", "aff_unique_norm": "Sorbonne University;Criteo;Universit\u00e9 de Rouen", "aff_unique_dep": "CNRS-ISIR;Criteo AI Lab;LITIS", "aff_unique_url": "https://www.sorbonne.universite.fr;https://www.criteo.com;https://www.univ-rouen.fr", "aff_unique_abbr": "Sorbonne U;Criteo;", "aff_campus_unique_index": "0+0;0+0;0;0;0;0+0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0+0", "aff_country_unique": "France" }, { "title": "Generating 3D Molecules for Target Protein Binding", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16435", "id": "16435", "proceeding": "https://proceedings.mlr.press/v162/liu22m.html", "poster": "/media/PosterPDFs/ICML%202022/6211080fa89981f66b1a0c9d55c61d0f.png?t=1657661250.1259437", "slides": "/media/icml-2022/Slides/16435.pdf", "author_site": "Meng Liu, Youzhi Luo, Kanji Uchino, Koji Maruhashi, Shuiwang Ji", "author": "Meng Liu; Youzhi Luo; Kanji Uchino; Koji Maruhashi; Shuiwang Ji", "abstract": "A fundamental problem in drug discovery is to design molecules that bind to specific proteins. To tackle this problem using machine learning methods, here we propose a novel and effective framework, known as GraphBP, to generate 3D molecules that bind to given proteins by placing atoms of specific types and locations to the given binding site one by one. In particular, at each step, we first employ a 3D graph neural network to obtain geometry-aware and chemically informative representations from the intermediate contextual information. Such context includes the given binding site and atoms placed in the previous steps. Second, to preserve the desirable equivariance property, we select a local reference atom according to the designed auxiliary classifiers and then construct a local spherical coordinate system. Finally, to place a new atom, we generate its atom type and relative location w.r.t. the constructed local coordinate system via a flow model. We also consider generating the variables of interest sequentially to capture the underlying dependencies among them. Experiments demonstrate that our GraphBP is effective to generate 3D molecules with binding ability to target protein binding sites. 
Our implementation is available at https://github.com/divelab/GraphBP.", "bibtex": "@InProceedings{pmlr-v162-liu22m,\n title = \t {Generating 3{D} Molecules for Target Protein Binding},\n author = {Liu, Meng and Luo, Youzhi and Uchino, Kanji and Maruhashi, Koji and Ji, Shuiwang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13912--13924},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22m/liu22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22m.html},\n abstract = \t {A fundamental problem in drug discovery is to design molecules that bind to specific proteins. To tackle this problem using machine learning methods, here we propose a novel and effective framework, known as GraphBP, to generate 3D molecules that bind to given proteins by placing atoms of specific types and locations to the given binding site one by one. In particular, at each step, we first employ a 3D graph neural network to obtain geometry-aware and chemically informative representations from the intermediate contextual information. Such context includes the given binding site and atoms placed in the previous steps. Second, to preserve the desirable equivariance property, we select a local reference atom according to the designed auxiliary classifiers and then construct a local spherical coordinate system. Finally, to place a new atom, we generate its atom type and relative location w.r.t. the constructed local coordinate system via a flow model. We also consider generating the variables of interest sequentially to capture the underlying dependencies among them. Experiments demonstrate that our GraphBP is effective to generate 3D molecules with binding ability to target protein binding sites. 
Our implementation is available at https://github.com/divelab/GraphBP.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22m/liu22m.pdf", "supp": "", "pdf_size": 7534013, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5832718815392405433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science & Engineering, Texas A&M University, TX, USA; Department of Computer Science & Engineering, Texas A&M University, TX, USA; Fujitsu Research of America, INC., CA, USA; Fujitsu Research, Fujitsu Limited, Kanagawa, Japan; Department of Computer Science & Engineering, Texas A&M University, TX, USA", "aff_domain": "tamu.edu; ; ; ;tamu.edu", "email": "tamu.edu; ; ; ;tamu.edu", "github": "https://github.com/divelab/GraphBP", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22m.html", "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Texas A&M University;Fujitsu Research of America, INC.;Fujitsu Limited", "aff_unique_dep": "Department of Computer Science & Engineering;;Fujitsu Research", "aff_unique_url": "https://www.tamu.edu;https://www.fujitsu.com/us/;https://www.fujitsu.com", "aff_unique_abbr": "TAMU;FRA;Fujitsu", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "TX;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Generating Distributional Adversarial Examples to Evade Statistical Detectors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17325", "id": "17325", "proceeding": "https://proceedings.mlr.press/v162/kaya22a.html", "poster": "/media/PosterPDFs/ICML%202022/e148bbf8d64abf4aac7ea4a3c5560aee_nClDHaA.png?t=1658161054.8070219", "slides": "", "author_site": "Yigitcan Kaya, Muhammad Bilal Zafar, Sergul Aydore, Nathalie Rauschmayr, Krishnaram Kenthapadi", "author": "Yigitcan Kaya; Muhammad Bilal Zafar; Sergul Aydore; Nathalie Rauschmayr; Krishnaram Kenthapadi", "abstract": "Deep neural networks (DNNs) are known to be highly vulnerable to adversarial examples (AEs) that include malicious perturbations. Assumptions about the statistical differences between natural and adversarial inputs are commonplace in many detection techniques. As a best practice, AE detectors are evaluated against \u2019adaptive\u2019 attackers who actively perturb their inputs to avoid detection. Due to the difficulties in designing adaptive attacks, however, recent work suggests that most detectors have incomplete evaluation. We aim to fill this gap by designing a generic adaptive attack against detectors: the \u2019statistical indistinguishability attack\u2019 (SIA). SIA optimizes a novel objective to craft adversarial examples (AEs) that follow the same distribution as the natural inputs with respect to DNN representations. Our objective targets all DNN layers simultaneously as we show that AEs being indistinguishable at one layer might fail to be so at other layers. SIA is formulated around evading distributional detectors that inspect a set of AEs as a whole and is also effective against four individual AE detectors, two dataset shift detectors, and an out-of-distribution sample detector, curated from published works. 
This suggests that SIA can be a reliable tool for evaluating the security of a range of detectors.", "bibtex": "@InProceedings{pmlr-v162-kaya22a,\n title = \t {Generating Distributional Adversarial Examples to Evade Statistical Detectors},\n author = {Kaya, Yigitcan and Zafar, Muhammad Bilal and Aydore, Sergul and Rauschmayr, Nathalie and Kenthapadi, Krishnaram},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10895--10911},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kaya22a/kaya22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kaya22a.html},\n abstract = \t {Deep neural networks (DNNs) are known to be highly vulnerable to adversarial examples (AEs) that include malicious perturbations. Assumptions about the statistical differences between natural and adversarial inputs are commonplace in many detection techniques. As a best practice, AE detectors are evaluated against \u2019adaptive\u2019 attackers who actively perturb their inputs to avoid detection. Due to the difficulties in designing adaptive attacks, however, recent work suggests that most detectors have incomplete evaluation. We aim to fill this gap by designing a generic adaptive attack against detectors: the \u2019statistical indistinguishability attack\u2019 (SIA). SIA optimizes a novel objective to craft adversarial examples (AEs) that follow the same distribution as the natural inputs with respect to DNN representations. Our objective targets all DNN layers simultaneously as we show that AEs being indistinguishable at one layer might fail to be so at other layers. SIA is formulated around evading distributional detectors that inspect a set of AEs as a whole and is also effective against four individual AE detectors, two dataset shift detectors, and an out-of-distribution sample detector, curated from published works. 
This suggests that SIA can be a reliable tool for evaluating the security of a range of detectors.}\n}", "pdf": "https://proceedings.mlr.press/v162/kaya22a/kaya22a.pdf", "supp": "", "pdf_size": 11533159, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7966460423779535669&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Maryland College Park; Amazon Web Services; Amazon Web Services; Amazon Web Services; Fiddler AI", "aff_domain": "umd.edu; ; ; ; ", "email": "umd.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/kaya22a.html", "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "University of Maryland;Amazon;Fiddler AI", "aff_unique_dep": ";Amazon Web Services;", "aff_unique_url": "https://www/umd.edu;https://aws.amazon.com;https://www.fiddler.ai", "aff_unique_abbr": "UMD;AWS;Fiddler AI", "aff_campus_unique_index": "0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generative Coarse-Graining of Molecular Conformations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16717", "id": "16717", "proceeding": "https://proceedings.mlr.press/v162/wang22ag.html", "poster": "/media/PosterPDFs/ICML%202022/0c1c995b77ea7312f887ddd9f9d35de5.png?t=1657840755.3179142", "slides": "", "author_site": "Wujie Wang, Minkai Xu, Chen Cai, Benjamin Kurt Miller, Tess Smidt, Yusu Wang, Jian Tang, Rafael Gomez-Bombarelli", "author": "Wujie Wang; Minkai Xu; Chen Cai; Benjamin K Miller; Tess Smidt; Yusu Wang; Jian Tang; Rafael Gomez-Bombarelli", "abstract": "Coarse-graining (CG) of molecular simulations simplifies the particle representation by grouping selected atoms into pseudo-beads and therefore drastically accelerates simulation. However, such CG procedure induces information losses, which makes accurate backmapping, i.e., restoring fine-grained (FG) coordinates from CG coordinates, a long-standing challenge. Inspired by the recent progress in generative models and equivariant networks, we propose a novel model that rigorously embeds the vital probabilistic nature and geometrical consistency requirements of the backmapping transformation. Our model encodes the FG uncertainties into an invariant latent space and decodes them back to FG geometries via equivariant convolutions. To standardize the evaluation of this domain, we further provide three comprehensive benchmarks based on molecular dynamics trajectories. 
Extensive experiments show that our approach always recovers more realistic structures and outperforms existing data-driven methods with a significant margin.", "bibtex": "@InProceedings{pmlr-v162-wang22ag,\n title = \t {Generative Coarse-Graining of Molecular Conformations},\n author = {Wang, Wujie and Xu, Minkai and Cai, Chen and Miller, Benjamin K and Smidt, Tess and Wang, Yusu and Tang, Jian and Gomez-Bombarelli, Rafael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23213--23236},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ag/wang22ag.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ag.html},\n abstract = \t {Coarse-graining (CG) of molecular simulations simplifies the particle representation by grouping selected atoms into pseudo-beads and therefore drastically accelerates simulation. However, such CG procedure induces information losses, which makes accurate backmapping, i.e., restoring fine-grained (FG) coordinates from CG coordinates, a long-standing challenge. Inspired by the recent progress in generative models and equivariant networks, we propose a novel model that rigorously embeds the vital probabilistic nature and geometrical consistency requirements of the backmapping transformation. Our model encodes the FG uncertainties into an invariant latent space and decodes them back to FG geometries via equivariant convolutions. To standardize the evaluation of this domain, we further provide three comprehensive benchmarks based on molecular dynamics trajectories. Extensive experiments show that our approach always recovers more realistic structures and outperforms existing data-driven methods with a significant margin.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ag/wang22ag.pdf", "supp": "", "pdf_size": 11638534, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6589570772523921711&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/wang22ag.html" }, { "title": "Generative Cooperative Networks for Natural Language Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18131", "id": "18131", "proceeding": "https://proceedings.mlr.press/v162/lamprier22a.html", "poster": "/media/PosterPDFs/ICML%202022/c8cd63e1bf13c5016881652983fb615a.png?t=1657209247.9771724", "slides": "", "author_site": "Sylvain Lamprier, Thomas Scialom, Antoine Chaffin, Vincent Claveau, Ewa Kijak, Jacopo Staiano, Benjamin Piwowarski", "author": "Sylvain Lamprier; Thomas Scialom; Antoine Chaffin; Vincent Claveau; Ewa Kijak; Jacopo Staiano; Benjamin Piwowarski", "abstract": "Generative Adversarial Networks (GANs) have known a tremendous success for many continuous generation tasks, especially in the field of image generation. However, for discrete outputs such as language, optimizing GANs remains an open problem with many instabilities, as no gradient can be properly back-propagated from the discriminator output to the generator parameters. 
An alternative is to learn the generator network via reinforcement learning, using the discriminator signal as a reward, but such a technique suffers from moving rewards and vanishing gradient problems. Finally, it often falls short compared to direct maximum-likelihood approaches. In this paper, we introduce Generative Cooperative Networks, in which the discriminator architecture is cooperatively used along with the generation policy to output samples of realistic texts for the task at hand. We give theoretical guarantees of convergence for our approach, and study various efficient decoding schemes to empirically achieve state-of-the-art results in two main NLG tasks.", "bibtex": "@InProceedings{pmlr-v162-lamprier22a,\n title = \t {Generative Cooperative Networks for Natural Language Generation},\n author = {Lamprier, Sylvain and Scialom, Thomas and Chaffin, Antoine and Claveau, Vincent and Kijak, Ewa and Staiano, Jacopo and Piwowarski, Benjamin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11891--11905},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lamprier22a/lamprier22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lamprier22a.html},\n abstract = \t {Generative Adversarial Networks (GANs) have known a tremendous success for many continuous generation tasks, especially in the field of image generation. However, for discrete outputs such as language, optimizing GANs remains an open problem with many instabilities, as no gradient can be properly back-propagated from the discriminator output to the generator parameters. An alternative is to learn the generator network via reinforcement learning, using the discriminator signal as a reward, but such a technique suffers from moving rewards and vanishing gradient problems. Finally, it often falls short compared to direct maximum-likelihood approaches. In this paper, we introduce Generative Cooperative Networks, in which the discriminator architecture is cooperatively used along with the generation policy to output samples of realistic texts for the task at hand. 
We give theoretical guarantees of convergence for our approach, and study various efficient decoding schemes to empirically achieve state-of-the-art results in two main NLG tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/lamprier22a/lamprier22a.pdf", "supp": "", "pdf_size": 637963, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=872972713061862465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "ISIR - Sorbonne Universit\u00e9, Paris, France+ReciTAL, Paris, France; ISIR - Sorbonne Universit\u00e9, Paris, France+IRISA, Rennes, France; IRISA, Rennes, France+IMATAG, Rennes, France; IRISA, Rennes, France+CNRS; IRISA, Rennes, France; ReciTAL, Paris, France; ISIR - Sorbonne Universit\u00e9, Paris, France+CNRS", "aff_domain": "isir.upmc.fr; ; ; ; ; ; ", "email": "isir.upmc.fr; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/lamprier22a.html", "aff_unique_index": "0+1;0+2;2+3;2+4;2;1;0+4", "aff_unique_norm": "Sorbonne Universit\u00e9;reciTAL;Institut de Recherche en Informatique et Syst\u00e8mes Al\u00e9atoires;IMATAG;Centre National de la Recherche Scientifique", "aff_unique_dep": "Institut des Sciences de l'Ing\u00e9nierie de Robotique;;;;", "aff_unique_url": "https://www.sorbonne-universite.fr;;https://www.irisa.fr;;https://www.cnrs.fr", "aff_unique_abbr": "Sorbonne U;;IRISA;;CNRS", "aff_campus_unique_index": "0+0;0+1;1+1;1;1;0;0", "aff_campus_unique": "Paris;Rennes;", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0;0;0+0", "aff_country_unique": "France" }, { "title": "Generative Flow Networks for Discrete Probabilistic Modeling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17135", "id": "17135", "proceeding": "https://proceedings.mlr.press/v162/zhang22v.html", "poster": "/media/PosterPDFs/ICML%202022/efe937780e95574250dabe07151bdc23.png?t=1657632177.3072882", "slides": "", "author_site": "Dinghuai Zhang, Nikolay Malkin, Zhen Liu, Alexandra Volokhova, Aaron Courville, Yoshua Bengio", "author": "Dinghuai Zhang; Nikolay Malkin; Zhen Liu; Alexandra Volokhova; Aaron Courville; Yoshua Bengio", "abstract": "We present energy-based generative flow networks (EB-GFN), a novel probabilistic modeling algorithm for high-dimensional discrete data. Building upon the theory of generative flow networks (GFlowNets), we model the generation process by a stochastic data construction policy and thus amortize expensive MCMC exploration into a fixed number of actions sampled from a GFlowNet. We show how GFlowNets can approximately perform large-block Gibbs sampling to mix between modes. We propose a framework to jointly train a GFlowNet with an energy function, so that the GFlowNet learns to sample from the energy distribution, while the energy learns with an approximate MLE objective with negative samples from the GFlowNet. We demonstrate EB-GFN\u2019s effectiveness on various probabilistic modeling tasks. 
Code is publicly available at https://github.com/zdhNarsil/EB_GFN.", "bibtex": "@InProceedings{pmlr-v162-zhang22v,\n title = \t {Generative Flow Networks for Discrete Probabilistic Modeling},\n author = {Zhang, Dinghuai and Malkin, Nikolay and Liu, Zhen and Volokhova, Alexandra and Courville, Aaron and Bengio, Yoshua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26412--26428},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22v/zhang22v.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22v.html},\n abstract = \t {We present energy-based generative flow networks (EB-GFN), a novel probabilistic modeling algorithm for high-dimensional discrete data. Building upon the theory of generative flow networks (GFlowNets), we model the generation process by a stochastic data construction policy and thus amortize expensive MCMC exploration into a fixed number of actions sampled from a GFlowNet. We show how GFlowNets can approximately perform large-block Gibbs sampling to mix between modes. We propose a framework to jointly train a GFlowNet with an energy function, so that the GFlowNet learns to sample from the energy distribution, while the energy learns with an approximate MLE objective with negative samples from the GFlowNet. We demonstrate EB-GFN\u2019s effectiveness on various probabilistic modeling tasks. Code is publicly available at https://github.com/zdhNarsil/EB_GFN.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22v/zhang22v.pdf", "supp": "", "pdf_size": 4877768, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5719959167998853445&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada; Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada; Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada; Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada; Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada; Mila - Quebec AI Institute and Universit\u00e9 de Montr\u00e9al, Montreal, Quebec, Canada", "aff_domain": "mila.quebec; ; ; ; ; ", "email": "mila.quebec; ; ; ; ; ", "github": "github.com/zdhNarsil/EB GFN", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22v.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "Quebec AI Institute", "aff_unique_url": "https://www.umontreal.ca", "aff_unique_abbr": "UdeM", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Generative Modeling for Multi-task Visual Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17651", "id": "17651", "proceeding": "https://proceedings.mlr.press/v162/bao22c.html", "poster": "/media/PosterPDFs/ICML%202022/b80ba73857eed2a36dc7640e2310055a_W1hpgCf.png?t=1656549722.3852196", "slides": "/media/icml-2022/Slides/17651.pdf", "author_site": "Zhipeng Bao, Martial Hebert, Yu-Xiong 
Wang", "author": "Zhipeng Bao; Martial Hebert; Yu-Xiong Wang", "abstract": "Generative modeling has recently shown great promise in computer vision, but it has mostly focused on synthesizing visually realistic images. In this paper, motivated by multi-task learning of shareable feature representations, we consider a novel problem of learning a shared generative model that is useful across various visual perception tasks. Correspondingly, we propose a general multi-task oriented generative modeling (MGM) framework, by coupling a discriminative multi-task network with a generative network. While it is challenging to synthesize both RGB images and pixel-level annotations in multi-task scenarios, our framework enables us to use synthesized images paired with only weak annotations (i.e., image-level scene labels) to facilitate multiple visual tasks. Experimental evaluation on challenging multi-task benchmarks, including NYUv2 and Taskonomy, demonstrates that our MGM framework improves the performance of all the tasks by large margins, consistently outperforming state-of-the-art multi-task approaches in different sample-size regimes.", "bibtex": "@InProceedings{pmlr-v162-bao22c,\n title = \t {Generative Modeling for Multi-task Visual Learning},\n author = {Bao, Zhipeng and Hebert, Martial and Wang, Yu-Xiong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1537--1554},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bao22c/bao22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/bao22c.html},\n abstract = \t {Generative modeling has recently shown great promise in computer vision, but it has mostly focused on synthesizing visually realistic images. In this paper, motivated by multi-task learning of shareable feature representations, we consider a novel problem of learning a shared generative model that is useful across various visual perception tasks. Correspondingly, we propose a general multi-task oriented generative modeling (MGM) framework, by coupling a discriminative multi-task network with a generative network. While it is challenging to synthesize both RGB images and pixel-level annotations in multi-task scenarios, our framework enables us to use synthesized images paired with only weak annotations (i.e., image-level scene labels) to facilitate multiple visual tasks. 
Experimental evaluation on challenging multi-task benchmarks, including NYUv2 and Taskonomy, demonstrates that our MGM framework improves the performance of all the tasks by large margins, consistently outperforming state-of-the-art multi-task approaches in different sample-size regimes.}\n}", "pdf": "https://proceedings.mlr.press/v162/bao22c/bao22c.pdf", "supp": "", "pdf_size": 6088120, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7184993398931022761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Carnegie Mellon University; Carnegie Mellon University; University of Illinois at Urbana-Champaign", "aff_domain": "cs.cmu.edu;cs.cmu.edu;illinois.edu", "email": "cs.cmu.edu;cs.cmu.edu;illinois.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bao22c.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Carnegie Mellon University;University of Illinois Urbana-Champaign", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://illinois.edu", "aff_unique_abbr": "CMU;UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Generative Trees: Adversarial and Copycat", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15989", "id": "15989", "proceeding": "https://proceedings.mlr.press/v162/nock22a.html", "poster": "/media/PosterPDFs/ICML%202022/8fb21ee7a2207526da55a679f0332de2.png?t=1657532198.1219916", "slides": "", "author_site": "Richard Nock, Mathieu Guillame-Bert", "author": "Richard Nock; Mathieu Guillame-Bert", "abstract": "While Generative Adversarial Networks (GANs) achieve spectacular results on unstructured data like images, there is still a gap on", "bibtex": "@InProceedings{pmlr-v162-nock22a,\n title = \t {Generative Trees: Adversarial and Copycat},\n author = {Nock, Richard and Guillame-Bert, Mathieu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16906--16951},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nock22a/nock22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nock22a.html},\n abstract = \t {While Generative Adversarial Networks (GANs) achieve spectacular results on unstructured data like images, there is still a gap on", "pdf": "https://proceedings.mlr.press/v162/nock22a/nock22a.pdf", "supp": "", "pdf_size": 13804386, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8681008047629310650&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Google Research; Google Research", "aff_domain": "google.com; ", "email": "google.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/nock22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Generic Coreset for Scalable Learning of Monotonic Kernels: Logistic Regression, Sigmoid and more", 
"status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16169", "id": "16169", "proceeding": "https://proceedings.mlr.press/v162/tolochinksy22a.html", "poster": "/media/PosterPDFs/ICML%202022/f457c545a9ded88f18ecee47145a72c0.png?t=1657436641.1555421", "slides": "", "author_site": "Elad Tolochinksy, Ibrahim Jubran, Dan Feldman", "author": "Elad Tolochinksy; Ibrahim Jubran; Dan Feldman", "abstract": "Coreset (or core-set) is a small weighted", "bibtex": "@InProceedings{pmlr-v162-tolochinksy22a,\n title = \t {Generic Coreset for Scalable Learning of Monotonic Kernels: Logistic Regression, Sigmoid and more},\n author = {Tolochinksy, Elad and Jubran, Ibrahim and Feldman, Dan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21520--21547},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tolochinksy22a/tolochinksy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tolochinksy22a.html},\n abstract = \t {Coreset (or core-set) is a small weighted", "pdf": "https://proceedings.mlr.press/v162/tolochinksy22a/tolochinksy22a.pdf", "supp": "", "pdf_size": 843410, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8709913574702811172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Robotics & Big Data Labs, Computer Science Department, University of Haifa, Israel; Robotics & Big Data Labs, Computer Science Department, University of Haifa, Israel; Robotics & Big Data Labs, Computer Science Department, University of Haifa, Israel", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tolochinksy22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Haifa", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.haifa.ac.il", "aff_unique_abbr": "UoH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Geometric Multimodal Contrastive Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16227", "id": "16227", "proceeding": "https://proceedings.mlr.press/v162/poklukar22a.html", "poster": "/media/PosterPDFs/ICML%202022/be767243ca8f574c740fb4c26cc6dceb.png?t=1657790350.3372085", "slides": "", "author_site": "Petra Poklukar, Miguel Vasco, Hang Yin, Francisco S. Melo, Ana Paiva, Danica Kragic", "author": "Petra Poklukar; Miguel Vasco; Hang Yin; Francisco S. Melo; Ana Paiva; Danica Kragic", "abstract": "Learning representations of multimodal data that are both informative and robust to missing modalities at test time remains a challenging problem due to the inherent heterogeneity of data obtained from different channels. 
To address it, we present a novel Geometric Multimodal Contrastive (GMC) representation learning method consisting of two main components: i) a two-level architecture consisting of modality-specific base encoders, allowing to process an arbitrary number of modalities to an intermediate representation of fixed dimensionality, and a shared projection head, mapping the intermediate representations to a latent representation space; ii) a multimodal contrastive loss function that encourages the geometric alignment of the learned representations. We experimentally demonstrate that GMC representations are semantically rich and achieve state-of-the-art performance with missing modality information on three different learning problems including prediction and reinforcement learning tasks.", "bibtex": "@InProceedings{pmlr-v162-poklukar22a,\n title = \t {Geometric Multimodal Contrastive Representation Learning},\n author = {Poklukar, Petra and Vasco, Miguel and Yin, Hang and Melo, Francisco S. and Paiva, Ana and Kragic, Danica},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17782--17800},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/poklukar22a/poklukar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/poklukar22a.html},\n abstract = \t {Learning representations of multimodal data that are both informative and robust to missing modalities at test time remains a challenging problem due to the inherent heterogeneity of data obtained from different channels. To address it, we present a novel Geometric Multimodal Contrastive (GMC) representation learning method consisting of two main components: i) a two-level architecture consisting of modality-specific base encoders, allowing to process an arbitrary number of modalities to an intermediate representation of fixed dimensionality, and a shared projection head, mapping the intermediate representations to a latent representation space; ii) a multimodal contrastive loss function that encourages the geometric alignment of the learned representations. 
We experimentally demonstrate that GMC representations are semantically rich and achieve state-of-the-art performance with missing modality information on three different learning problems including prediction and reinforcement learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/poklukar22a/poklukar22a.pdf", "supp": "", "pdf_size": 1283757, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1723737180667149201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/poklukar22a.html" }, { "title": "Global Optimization Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16107", "id": "16107", "proceeding": "https://proceedings.mlr.press/v162/zhao22f.html", "poster": "/media/PosterPDFs/ICML%202022/c4ca4238a0b923820dcc509a6f75849b_OnslqzG.png?t=1658245504.9654973", "slides": "", "author_site": "Sen Zhao, Erez Louidor, Maya Gupta", "author": "Sen Zhao; Erez Louidor; Maya Gupta", "abstract": "We consider the problem of estimating a good maximizer of a black-box function given noisy examples. We propose to fit a new type of function called a global optimization network (GON), defined as any composition of an invertible function and a unimodal function, whose unique global maximizer can be inferred in $\\mathcal{O}(D)$ time, and used as the estimate. As an example way to construct GON functions, and interesting in its own right, we give new results for specifying multi-dimensional unimodal functions using lattice models with linear inequality constraints. We extend to", "bibtex": "@InProceedings{pmlr-v162-zhao22f,\n title = \t {Global Optimization Networks},\n author = {Zhao, Sen and Louidor, Erez and Gupta, Maya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26927--26957},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22f/zhao22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22f.html},\n abstract = \t {We consider the problem of estimating a good maximizer of a black-box function given noisy examples. We propose to fit a new type of function called a global optimization network (GON), defined as any composition of an invertible function and a unimodal function, whose unique global maximizer can be inferred in $\\mathcal{O}(D)$ time, and used as the estimate. As an example way to construct GON functions, and interesting in its own right, we give new results for specifying multi-dimensional unimodal functions using lattice models with linear inequality constraints. 
We extend to", "pdf": "https://proceedings.mlr.press/v162/zhao22f/zhao22f.pdf", "supp": "", "pdf_size": 2411317, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5304315037379165712&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research, Mountain View, CA 94043 USA; Google Research, Mountain View, CA 94043 USA; Google Research, Mountain View, CA 94043 USA", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Global Optimization of K-Center Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17785", "id": "17785", "proceeding": "https://proceedings.mlr.press/v162/shi22b.html", "poster": "/media/PosterPDFs/ICML%202022/88340338e0a0cdc54350c05cf056dca8.png?t=1656742575.6246634", "slides": "", "author_site": "Mingfei Shi, Kaixun Hua, Jiayang Ren, Yankai Cao", "author": "Mingfei Shi; Kaixun Hua; Jiayang Ren; Yankai Cao", "abstract": "$k$-center problem is a well-known clustering method and can be formulated as a mixed-integer nonlinear programming problem. This work provides a practical global optimization algorithm for this task based on a reduced-space spatial branch and bound scheme. This algorithm can guarantee convergence to the global optimum by only branching on the centers of clusters, which is independent of the dataset\u2019s cardinality. In addition, a set of feasibility-based bounds tightening techniques are proposed to narrow down the domain of centers and significantly accelerate the convergence. To demonstrate the capacity of this algorithm, we present computational results on 32 datasets. Notably, for the dataset with 14 million samples and 3 features, the serial implementation of the algorithm can converge to an optimality gap of 0.1% within 2 hours. Compared with a heuristic method, the global optimum obtained by our algorithm can reduce the objective function on average by 30.4%.", "bibtex": "@InProceedings{pmlr-v162-shi22b,\n title = \t {Global Optimization of K-Center Clustering},\n author = {Shi, Mingfei and Hua, Kaixun and Ren, Jiayang and Cao, Yankai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19956--19966},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22b/shi22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22b.html},\n abstract = \t {$k$-center problem is a well-known clustering method and can be formulated as a mixed-integer nonlinear programming problem. This work provides a practical global optimization algorithm for this task based on a reduced-space spatial branch and bound scheme. This algorithm can guarantee convergence to the global optimum by only branching on the centers of clusters, which is independent of the dataset\u2019s cardinality. 
In addition, a set of feasibility-based bounds tightening techniques are proposed to narrow down the domain of centers and significantly accelerate the convergence. To demonstrate the capacity of this algorithm, we present computational results on 32 datasets. Notably, for the dataset with 14 million samples and 3 features, the serial implementation of the algorithm can converge to an optimality gap of 0.1% within 2 hours. Compared with a heuristic method, the global optimum obtained by our algorithm can reduce the objective function on average by 30.4%.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22b/shi22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/shi22b-supp.zip", "pdf_size": 408999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3518043969021402453&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Chemical and Biological Engineering, University of British Columbia, Vancouver, British Columbia, Canada; Department of Chemical and Biological Engineering, University of British Columbia, Vancouver, British Columbia, Canada; Department of Chemical and Biological Engineering, University of British Columbia, Vancouver, British Columbia, Canada; Department of Chemical and Biological Engineering, University of British Columbia, Vancouver, British Columbia, Canada", "aff_domain": "ubc.ca;ubc.ca;ubc.ca;ubc.ca", "email": "ubc.ca;ubc.ca;ubc.ca;ubc.ca", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/shi22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of British Columbia", "aff_unique_dep": "Department of Chemical and Biological Engineering", "aff_unique_url": "https://www.ubc.ca", "aff_unique_abbr": "UBC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Vancouver", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Goal Misgeneralization in Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18379", "id": "18379", "proceeding": "https://proceedings.mlr.press/v162/langosco22a.html", "poster": "/media/PosterPDFs/ICML%202022/faefec47428cf9a2f0875ba9c2042a81.png?t=1658245228.29595", "slides": "", "author_site": "Lauro Langosco di Langosco, Jack Koch, Lee Sharkey, Jacob Pfau, David Krueger", "author": "Lauro Langosco Di Langosco; Jack Koch; Lee D Sharkey; Jacob Pfau; David Krueger", "abstract": "We study", "bibtex": "@InProceedings{pmlr-v162-langosco22a,\n title = \t {Goal Misgeneralization in Deep Reinforcement Learning},\n author = {Langosco, Lauro Langosco Di and Koch, Jack and Sharkey, Lee D and Pfau, Jacob and Krueger, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12004--12019},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/langosco22a/langosco22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/langosco22a.html},\n abstract = \t {We study", "pdf": "https://proceedings.mlr.press/v162/langosco22a/langosco22a.pdf", "supp": "", "pdf_size": 4355011, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10141783547784407570&as_sdt=5,33&sciodt=0,33&hl=en", 
"gs_version_total": 5, "aff": "University of Cambridge; University of Cambridge; University of T\u00fcbingen; University of Edinburgh; University of Cambridge", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/langosco22a.html", "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Cambridge;University of T\u00fcbingen;University of Edinburgh", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.uni-tuebingen.de/;https://www.ed.ac.uk", "aff_unique_abbr": "Cambridge;Uni T\u00fcbingen;Edinburgh", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United Kingdom;Germany" }, { "title": "Going Deeper into Permutation-Sensitive Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16659", "id": "16659", "proceeding": "https://proceedings.mlr.press/v162/huang22l.html", "poster": "/media/PosterPDFs/ICML%202022/ca9c267dad0305d1a6308d2a0cf1c39c_LE1e9rw.png?t=1658043405.0704465", "slides": "/media/icml-2022/Slides/16659_urBfEKz.pdf", "author_site": "Zhongyu Huang, Yingheng Wang, Chaozhuo Li, Huiguang He", "author": "Zhongyu Huang; Yingheng Wang; Chaozhuo Li; Huiguang He", "abstract": "The invariance to permutations of the adjacency matrix, i.e., graph isomorphism, is an overarching requirement for Graph Neural Networks (GNNs). Conventionally, this prerequisite can be satisfied by the invariant operations over node permutations when aggregating messages. However, such an invariant manner may ignore the relationships among neighboring nodes, thereby hindering the expressivity of GNNs. In this work, we devise an efficient permutation-sensitive aggregation mechanism via permutation groups, capturing pairwise correlations between neighboring nodes. We prove that our approach is strictly more powerful than the 2-dimensional Weisfeiler-Lehman (2-WL) graph isomorphism test and not less powerful than the 3-WL test. Moreover, we prove that our approach achieves the linear sampling complexity. Comprehensive experiments on multiple synthetic and real-world datasets demonstrate the superiority of our model.", "bibtex": "@InProceedings{pmlr-v162-huang22l,\n title = \t {Going Deeper into Permutation-Sensitive Graph Neural Networks},\n author = {Huang, Zhongyu and Wang, Yingheng and Li, Chaozhuo and He, Huiguang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9377--9409},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22l/huang22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22l.html},\n abstract = \t {The invariance to permutations of the adjacency matrix, i.e., graph isomorphism, is an overarching requirement for Graph Neural Networks (GNNs). Conventionally, this prerequisite can be satisfied by the invariant operations over node permutations when aggregating messages. However, such an invariant manner may ignore the relationships among neighboring nodes, thereby hindering the expressivity of GNNs. 
In this work, we devise an efficient permutation-sensitive aggregation mechanism via permutation groups, capturing pairwise correlations between neighboring nodes. We prove that our approach is strictly more powerful than the 2-dimensional Weisfeiler-Lehman (2-WL) graph isomorphism test and not less powerful than the 3-WL test. Moreover, we prove that our approach achieves the linear sampling complexity. Comprehensive experiments on multiple synthetic and real-world datasets demonstrate the superiority of our model.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22l/huang22l.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/huang22l-supp.zip", "pdf_size": 1041018, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14997369349376020515&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China+School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China; Department of Electronic Engineering, Tsinghua University, Beijing, China+Department of Biomedical Engineering, Johns Hopkins University, Baltimore, Maryland, USA; Microsoft Research Asia, Beijing, China; Center for Excellence in Brain Science and Intelligence Technology, Chinese Academy of Sciences, Beijing, China+School of Artificial Intelligence, University of Chinese Academy of Sciences, Beijing, China", "aff_domain": "ia.ac.cn; ; ;ia.ac.cn", "email": "ia.ac.cn; ; ;ia.ac.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/huang22l.html", "aff_unique_index": "0+1;2+3;4;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Tsinghua University;Johns Hopkins University;Microsoft", "aff_unique_dep": "Institute of Automation;School of Artificial Intelligence;Department of Electronic Engineering;Department of Biomedical Engineering;Research", "aff_unique_url": "http://www.ia.cas.cn;http://www.ucas.ac.cn;https://www.tsinghua.edu.cn;https://www.jhu.edu;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "CAS;UCAS;THU;JHU;MSRA", "aff_campus_unique_index": "0+0;0+1;0;0+0", "aff_campus_unique": "Beijing;Baltimore", "aff_country_unique_index": "0+0;0+1;0;0+0", "aff_country_unique": "China;United States" }, { "title": "Gradient Based Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17663", "id": "17663", "proceeding": "https://proceedings.mlr.press/v162/armacki22a.html", "poster": "/media/PosterPDFs/ICML%202022/6b620aedfa4cf153467265629501dd61.png?t=1657908146.735701", "slides": "/media/icml-2022/Slides/17663.pdf", "author_site": "Aleksandar Armacki, Dragana Bajovic, Dusan Jakovetic, Soummya Kar", "author": "Aleksandar Armacki; Dragana Bajovic; Dusan Jakovetic; Soummya Kar", "abstract": "We propose a general approach for distance based clustering, using the gradient of the cost function that measures clustering quality with respect to cluster assignments and cluster center positions. The approach is an iterative two step procedure (alternating between cluster assignment and cluster center updates) and is applicable to a wide range of functions, satisfying some mild assumptions. The main advantage of the proposed approach is a simple and computationally cheap update rule. 
Unlike previous methods that specialize to a specific formulation of the clustering problem, our approach is applicable to a wide range of costs, including non-Bregman clustering methods based on the Huber loss. We analyze the convergence of the proposed algorithm, and show that it converges to the set of appropriately defined fixed points, under arbitrary center initialization. In the special case of Bregman cost functions, the algorithm converges to the set of centroidal Voronoi partitions, which is consistent with prior works. Numerical experiments on real data demonstrate the effectiveness of the proposed method.", "bibtex": "@InProceedings{pmlr-v162-armacki22a,\n title = \t {Gradient Based Clustering},\n author = {Armacki, Aleksandar and Bajovic, Dragana and Jakovetic, Dusan and Kar, Soummya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {929--947},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/armacki22a/armacki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/armacki22a.html},\n abstract = \t {We propose a general approach for distance based clustering, using the gradient of the cost function that measures clustering quality with respect to cluster assignments and cluster center positions. The approach is an iterative two step procedure (alternating between cluster assignment and cluster center updates) and is applicable to a wide range of functions, satisfying some mild assumptions. The main advantage of the proposed approach is a simple and computationally cheap update rule. Unlike previous methods that specialize to a specific formulation of the clustering problem, our approach is applicable to a wide range of costs, including non-Bregman clustering methods based on the Huber loss. We analyze the convergence of the proposed algorithm, and show that it converges to the set of appropriately defined fixed points, under arbitrary center initialization. In the special case of Bregman cost functions, the algorithm converges to the set of centroidal Voronoi partitions, which is consistent with prior works. 
Numerical experiments on real data demonstrate the effectiveness of the proposed method.}\n}", "pdf": "https://proceedings.mlr.press/v162/armacki22a/armacki22a.pdf", "supp": "", "pdf_size": 638116, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=904721452533677863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA; Faculty of Technical Sciences, University of Novi Sad, Novi Sad, Serbia; Faculty of Sciences, University of Novi Sad, Novi Sad, Serbia; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "andrew.cmu.edu; ; ; ", "email": "andrew.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/armacki22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Carnegie Mellon University;University of Novi Sad", "aff_unique_dep": "Department of Electrical and Computer Engineering;Faculty of Technical Sciences", "aff_unique_url": "https://www.cmu.edu;https://www.uns.ac.rs", "aff_unique_abbr": "CMU;", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Pittsburgh;Novi Sad", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Serbia" }, { "title": "Gradient Descent on Neurons and its Link to Approximate Second-order Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16353", "id": "16353", "proceeding": "https://proceedings.mlr.press/v162/benzing22a.html", "poster": "", "slides": "", "author": "Frederik Benzing", "abstract": "Second-order optimizers are thought to hold the potential to speed up neural network training, but due to the enormous size of the curvature matrix, they typically require approximations to be computationally tractable. The most successful family of approximations are Kronecker-Factored, block-diagonal curvature estimates (KFAC). Here, we combine tools from prior work to evaluate exact second-order updates with careful ablations to establish a surprising result: Due to its approximations, KFAC is not closely related to second-order updates, and in particular, it significantly outperforms true second-order updates. This challenges widely held believes and immediately raises the question why KFAC performs so well. Towards answering this question we present evidence strongly suggesting that KFAC approximates a first-order algorithm, which performs gradient descent on neurons rather than weights. 
Finally, we show that this optimizer often improves over KFAC in terms of computational cost and data-efficiency.", "bibtex": "@InProceedings{pmlr-v162-benzing22a,\n title = \t {Gradient Descent on Neurons and its Link to Approximate Second-order Optimization},\n author = {Benzing, Frederik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1817--1853},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/benzing22a/benzing22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/benzing22a.html},\n abstract = \t {Second-order optimizers are thought to hold the potential to speed up neural network training, but due to the enormous size of the curvature matrix, they typically require approximations to be computationally tractable. The most successful family of approximations are Kronecker-Factored, block-diagonal curvature estimates (KFAC). Here, we combine tools from prior work to evaluate exact second-order updates with careful ablations to establish a surprising result: Due to its approximations, KFAC is not closely related to second-order updates, and in particular, it significantly outperforms true second-order updates. This challenges widely held believes and immediately raises the question why KFAC performs so well. Towards answering this question we present evidence strongly suggesting that KFAC approximates a first-order algorithm, which performs gradient descent on neurons rather than weights. Finally, we show that this optimizer often improves over KFAC in terms of computational cost and data-efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/benzing22a/benzing22a.pdf", "supp": "", "pdf_size": 1987676, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4847605706007812580&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, ETH Zurich, Zurich, Switzerland", "aff_domain": "inf.ethz.ch", "email": "inf.ethz.ch", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/benzing22a.html", "aff_unique_index": "0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0", "aff_country_unique": "Switzerland" }, { "title": "Gradient-Free Method for Heavily Constrained Nonconvex Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17461", "id": "17461", "proceeding": "https://proceedings.mlr.press/v162/shi22a.html", "poster": "/media/PosterPDFs/ICML%202022/7d12b66d3df6af8d429c1a357d8b9e1a.png?t=1657629285.7463837", "slides": "", "author_site": "Wanli Shi, Hongchang Gao, Bin Gu", "author": "Wanli Shi; Hongchang Gao; Bin Gu", "abstract": "Zeroth-order (ZO) method has been shown to be a powerful method for solving the optimization problem where explicit expression of the gradients is difficult or infeasible to obtain. Recently, due to the practical value of the constrained problems, a lot of ZO Frank-Wolfe or projected ZO methods have been proposed. 
However, in many applications, we may have a very large number of nonconvex white/black-box constraints, which makes the existing zeroth-order methods extremely inefficient (or even not working) since they need to inquire function value of all the constraints and project the solution to the complicated feasible set. In this paper, to solve the nonconvex problem with a large number of white/black-box constraints, we proposed a doubly stochastic zeroth-order gradient method (DSZOG) with momentum method and adaptive step size. Theoretically, we prove DSZOG can converge to the $\\epsilon$-stationary point of the constrained problem. Experimental results in two applications demonstrate the superiority of our method in terms of training time and accuracy compared with other ZO methods for the constrained problem.", "bibtex": "@InProceedings{pmlr-v162-shi22a,\n title = \t {Gradient-Free Method for Heavily Constrained Nonconvex Optimization},\n author = {Shi, Wanli and Gao, Hongchang and Gu, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19935--19955},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22a/shi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22a.html},\n abstract = \t {Zeroth-order (ZO) method has been shown to be a powerful method for solving the optimization problem where explicit expression of the gradients is difficult or infeasible to obtain. Recently, due to the practical value of the constrained problems, a lot of ZO Frank-Wolfe or projected ZO methods have been proposed. However, in many applications, we may have a very large number of nonconvex white/black-box constraints, which makes the existing zeroth-order methods extremely inefficient (or even not working) since they need to inquire function value of all the constraints and project the solution to the complicated feasible set. In this paper, to solve the nonconvex problem with a large number of white/black-box constraints, we proposed a doubly stochastic zeroth-order gradient method (DSZOG) with momentum method and adaptive step size. Theoretically, we prove DSZOG can converge to the $\\epsilon$-stationary point of the constrained problem. 
Experimental results in two applications demonstrate the superiority of our method in terms of training time and accuracy compared with other ZO methods for the constrained problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22a/shi22a.pdf", "supp": "", "pdf_size": 415297, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9220440702935051018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Nanjing University of Information Science and Technology, Jiangsu, China+MBZUAI, Abu Dhabi, UAE; Department of Computer and Information Sciences, Temple University, PA, USA; Nanjing University of Information Science and Technology, Jiangsu, China+MBZUAI, Abu Dhabi, UAE", "aff_domain": "gmail.com; ;gmail.com", "email": "gmail.com; ;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/shi22a.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Nanjing University of Information Science and Technology;Mohamed bin Zayed University of Artificial Intelligence;Temple University", "aff_unique_dep": ";;Department of Computer and Information Sciences", "aff_unique_url": "http://www.nuist.edu.cn;https://www.mbzuali.ac.ae;https://www.temple.edu", "aff_unique_abbr": "NUIST;MBZUAI;Temple", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Abu Dhabi;PA", "aff_country_unique_index": "0+1;2;0+1", "aff_country_unique": "China;United Arab Emirates;United States" }, { "title": "Graph Neural Architecture Search Under Distribution Shifts", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17739", "id": "17739", "proceeding": "https://proceedings.mlr.press/v162/qin22b.html", "poster": "", "slides": "", "author_site": "Yijian Qin, Xin Wang, Ziwei Zhang, Pengtao Xie, Wenwu Zhu", "author": "Yijian Qin; Xin Wang; Ziwei Zhang; Pengtao Xie; Wenwu Zhu", "abstract": "Graph neural architecture search has shown great potentials for automatically designing graph neural network (GNN) architectures for graph classification tasks. However, when there is a distribution shift between training and testing graphs, the existing approaches fail to deal with the problem of adapting to unknown test graph structures since they only search for a fixed architecture for all graphs. To solve this problem, we propose a novel GRACES model which is able to generalize under distribution shifts through tailoring a customized GNN architecture suitable for each graph instance with unknown distribution. Specifically, we design a self-supervised disentangled graph encoder to characterize invariant factors hidden in diverse graph structures. Then, we propose a prototype-based architecture customization strategy to generate the most suitable GNN architecture weights in a continuous space for each graph instance. We further propose a customized super-network to share weights among different architectures for the sake of efficient training. 
Extensive experiments on both synthetic and real-world datasets demonstrate that our proposed GRACES model can adapt to diverse graph structures and achieve state-of-the-art performance for graph classification tasks under distribution shifts.", "bibtex": "@InProceedings{pmlr-v162-qin22b,\n title = \t {Graph Neural Architecture Search Under Distribution Shifts},\n author = {Qin, Yijian and Wang, Xin and Zhang, Ziwei and Xie, Pengtao and Zhu, Wenwu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18083--18095},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qin22b/qin22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/qin22b.html},\n abstract = \t {Graph neural architecture search has shown great potentials for automatically designing graph neural network (GNN) architectures for graph classification tasks. However, when there is a distribution shift between training and testing graphs, the existing approaches fail to deal with the problem of adapting to unknown test graph structures since they only search for a fixed architecture for all graphs. To solve this problem, we propose a novel GRACES model which is able to generalize under distribution shifts through tailoring a customized GNN architecture suitable for each graph instance with unknown distribution. Specifically, we design a self-supervised disentangled graph encoder to characterize invariant factors hidden in diverse graph structures. Then, we propose a prototype-based architecture customization strategy to generate the most suitable GNN architecture weights in a continuous space for each graph instance. We further propose a customized super-network to share weights among different architectures for the sake of efficient training. 
Extensive experiments on both synthetic and real-world datasets demonstrate that our proposed GRACES model can adapt to diverse graph structures and achieve state-of-the-art performance for graph classification tasks under distribution shifts.}\n}", "pdf": "https://proceedings.mlr.press/v162/qin22b/qin22b.pdf", "supp": "", "pdf_size": 619799, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10266971519085102072&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science and Technology, Tsinghua University; Department of Computer Science and Technology, Tsinghua University + THU-Bosch JCML center; Department of Computer Science and Technology, Tsinghua University; UC San Diego; Department of Computer Science and Technology, Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;ucsd.edu;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;ucsd.edu;tsinghua.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qin22b.html", "aff_unique_index": "0;0+0;0;1;0", "aff_unique_norm": "Tsinghua University;University of California, San Diego", "aff_unique_dep": "Department of Computer Science and Technology;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucsd.edu", "aff_unique_abbr": "THU;UCSD", "aff_campus_unique_index": ";1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0+0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Graph-Coupled Oscillator Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17241", "id": "17241", "proceeding": "https://proceedings.mlr.press/v162/rusch22a.html", "poster": "/media/PosterPDFs/ICML%202022/810462d01f318bd13e628a77fc3f92c0.png?t=1657560605.2922451", "slides": "", "author_site": "T. Konstantin Rusch, Ben Chamberlain, James Rowbottom, Siddhartha Mishra, Michael Bronstein", "author": "T. Konstantin Rusch; Ben Chamberlain; James Rowbottom; Siddhartha Mishra; Michael Bronstein", "abstract": "We propose Graph-Coupled Oscillator Networks (GraphCON), a novel framework for deep learning on graphs. It is based on discretizations of a second-order system of ordinary differential equations (ODEs), which model a network of nonlinear controlled and damped oscillators, coupled via the adjacency structure of the underlying graph. The flexibility of our framework permits any basic GNN layer (e.g. convolutional or attentional) as the coupling function, from which a multi-layer deep neural network is built up via the dynamics of the proposed ODEs. We relate the oversmoothing problem, commonly encountered in GNNs, to the stability of steady states of the underlying ODE and show that zero-Dirichlet energy steady states are not stable for our proposed ODEs. This demonstrates that the proposed framework mitigates the oversmoothing problem. Moreover, we prove that GraphCON mitigates the exploding and vanishing gradients problem to facilitate training of deep multi-layer GNNs. Finally, we show that our approach offers competitive performance with respect to the state-of-the-art on a variety of graph-based learning tasks.", "bibtex": "@InProceedings{pmlr-v162-rusch22a,\n title = \t {Graph-Coupled Oscillator Networks},\n author = {Rusch, T. 
Konstantin and Chamberlain, Ben and Rowbottom, James and Mishra, Siddhartha and Bronstein, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18888--18909},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rusch22a/rusch22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rusch22a.html},\n abstract = \t {We propose Graph-Coupled Oscillator Networks (GraphCON), a novel framework for deep learning on graphs. It is based on discretizations of a second-order system of ordinary differential equations (ODEs), which model a network of nonlinear controlled and damped oscillators, coupled via the adjacency structure of the underlying graph. The flexibility of our framework permits any basic GNN layer (e.g. convolutional or attentional) as the coupling function, from which a multi-layer deep neural network is built up via the dynamics of the proposed ODEs. We relate the oversmoothing problem, commonly encountered in GNNs, to the stability of steady states of the underlying ODE and show that zero-Dirichlet energy steady states are not stable for our proposed ODEs. This demonstrates that the proposed framework mitigates the oversmoothing problem. Moreover, we prove that GraphCON mitigates the exploding and vanishing gradients problem to facilitate training of deep multi-layer GNNs. Finally, we show that our approach offers competitive performance with respect to the state-of-the-art on a variety of graph-based learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/rusch22a/rusch22a.pdf", "supp": "", "pdf_size": 585735, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9009434155878040135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Seminar for Applied Mathematics (SAM), D-MATH, ETH Z\u00fcrich, Switzerland+ETH AI Center, ETH Z\u00fcrich; Twitter Inc., London, UK; Twitter Inc., London, UK; Seminar for Applied Mathematics (SAM), D-MATH, ETH Z\u00fcrich, Switzerland+ETH AI Center, ETH Z\u00fcrich; Department of Computer Science, University of Oxford, UK+Twitter Inc., London, UK", "aff_domain": "sam.math.ethz.ch; ; ; ; ", "email": "sam.math.ethz.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/rusch22a.html", "aff_unique_index": "0+0;1;1;0+0;2+1", "aff_unique_norm": "ETH Zurich;Twitter Inc.;University of Oxford", "aff_unique_dep": "Seminar for Applied Mathematics (SAM);;Department of Computer Science", "aff_unique_url": "https://www.ethz.ch;https://twitter.com;https://www.ox.ac.uk", "aff_unique_abbr": "ETH;Twitter;Oxford", "aff_campus_unique_index": "0;2;2;0;2", "aff_campus_unique": "Z\u00fcrich;;London", "aff_country_unique_index": "0+0;1;1;0+0;1+1", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "GraphFM: Improving Large-Scale GNN Training via Feature Momentum", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16579", "id": "16579", "proceeding": "https://proceedings.mlr.press/v162/yu22g.html", "poster": "/media/PosterPDFs/ICML%202022/dbb240d23ce3d732b67bcfbae5956b18.png?t=1657854553.3980834", "slides": "", "author_site": "Haiyang Yu, Limei Wang, Bokun Wang, Meng Liu, Tianbao Yang, Shuiwang Ji", "author": 
"Haiyang Yu; Limei Wang; Bokun Wang; Meng Liu; Tianbao Yang; Shuiwang Ji", "abstract": "Training of graph neural networks (GNNs) for large-scale node classification is challenging. A key difficulty lies in obtaining accurate hidden node representations while avoiding the neighborhood explosion problem. Here, we propose a new technique, named feature momentum (FM), that uses a momentum step to incorporate historical embeddings when updating feature representations. We develop two specific algorithms, known as GraphFM-IB and GraphFM-OB, that consider in-batch and out-of-batch data, respectively. GraphFM-IB applies FM to in-batch sampled data, while GraphFM-OB applies FM to out-of-batch data that are 1-hop neighborhood of in-batch data. We provide a convergence analysis for GraphFM-IB and some theoretical insight for GraphFM-OB. Empirically, we observe that GraphFM-IB can effectively alleviate the neighborhood explosion problem of existing methods. In addition, GraphFM-OB achieves promising performance on multiple large-scale graph datasets.", "bibtex": "@InProceedings{pmlr-v162-yu22g,\n title = \t {{G}raph{FM}: Improving Large-Scale {GNN} Training via Feature Momentum},\n author = {Yu, Haiyang and Wang, Limei and Wang, Bokun and Liu, Meng and Yang, Tianbao and Ji, Shuiwang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25684--25701},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22g/yu22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22g.html},\n abstract = \t {Training of graph neural networks (GNNs) for large-scale node classification is challenging. A key difficulty lies in obtaining accurate hidden node representations while avoiding the neighborhood explosion problem. Here, we propose a new technique, named feature momentum (FM), that uses a momentum step to incorporate historical embeddings when updating feature representations. We develop two specific algorithms, known as GraphFM-IB and GraphFM-OB, that consider in-batch and out-of-batch data, respectively. GraphFM-IB applies FM to in-batch sampled data, while GraphFM-OB applies FM to out-of-batch data that are 1-hop neighborhood of in-batch data. We provide a convergence analysis for GraphFM-IB and some theoretical insight for GraphFM-OB. Empirically, we observe that GraphFM-IB can effectively alleviate the neighborhood explosion problem of existing methods. 
In addition, GraphFM-OB achieves promising performance on multiple large-scale graph datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22g/yu22g.pdf", "supp": "", "pdf_size": 951657, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14093235266162728639&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science & Engineering, Texas A&M University; Department of Computer Science & Engineering, Texas A&M University; Department of Computer Science, The University of Iowa; Department of Computer Science & Engineering, Texas A&M University; Department of Computer Science, The University of Iowa; Department of Computer Science & Engineering, Texas A&M University", "aff_domain": "tamu.edu; ; ; ; ;tamu.edu", "email": "tamu.edu; ; ; ; ;tamu.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yu22g.html", "aff_unique_index": "0;0;1;0;1;0", "aff_unique_norm": "Texas A&M University;University of Iowa", "aff_unique_dep": "Department of Computer Science & Engineering;Department of Computer Science", "aff_unique_url": "https://www.tamu.edu;https://www.uiowa.edu", "aff_unique_abbr": "TAMU;UIowa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Greedy based Value Representation for Optimal Coordination in Multi-agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15983", "id": "15983", "proceeding": "https://proceedings.mlr.press/v162/wan22c.html", "poster": "/media/PosterPDFs/ICML%202022/24146db4eb48c718b84cae0a0799dcfc.png?t=1658055412.9579477", "slides": "/media/icml-2022/Slides/15983.pdf", "author_site": "Lipeng Wan, Zeyang Liu, Xingyu Chen, Xuguang Lan, Nanning Zheng", "author": "Lipeng Wan; Zeyang Liu; Xingyu Chen; Xuguang Lan; Nanning Zheng", "abstract": "Due to the representation limitation of the joint Q value function, multi-agent reinforcement learning methods with linear value decomposition (LVD) or monotonic value decomposition (MVD) suffer from relative overgeneralization. As a result, they can not ensure optimal consistency (i.e., the correspondence between individual greedy actions and the best team performance). In this paper, we derive the expression of the joint Q value function of LVD and MVD. According to the expression, we draw a transition diagram, where each self-transition node (STN) is a possible convergence. To ensure the optimal consistency, the optimal node is required to be the unique STN. Therefore, we propose the greedy-based value representation (GVR), which turns the optimal node into an STN via inferior target shaping and eliminates the non-optimal STNs via superior experience replay. Theoretical proofs and empirical results demonstrate that given the true Q values, GVR ensures the optimal consistency under sufficient exploration. Besides, in tasks where the true Q values are unavailable, GVR achieves an adaptive trade-off between optimality and stability. 
Our method outperforms state-of-the-art baselines in experiments on various benchmarks.", "bibtex": "@InProceedings{pmlr-v162-wan22c,\n title = \t {Greedy based Value Representation for Optimal Coordination in Multi-agent Reinforcement Learning},\n author = {Wan, Lipeng and Liu, Zeyang and Chen, Xingyu and Lan, Xuguang and Zheng, Nanning},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22512--22535},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wan22c/wan22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/wan22c.html},\n abstract = \t {Due to the representation limitation of the joint Q value function, multi-agent reinforcement learning methods with linear value decomposition (LVD) or monotonic value decomposition (MVD) suffer from relative overgeneralization. As a result, they can not ensure optimal consistency (i.e., the correspondence between individual greedy actions and the best team performance). In this paper, we derive the expression of the joint Q value function of LVD and MVD. According to the expression, we draw a transition diagram, where each self-transition node (STN) is a possible convergence. To ensure the optimal consistency, the optimal node is required to be the unique STN. Therefore, we propose the greedy-based value representation (GVR), which turns the optimal node into an STN via inferior target shaping and eliminates the non-optimal STNs via superior experience replay. Theoretical proofs and empirical results demonstrate that given the true Q values, GVR ensures the optimal consistency under sufficient exploration. Besides, in tasks where the true Q values are unavailable, GVR achieves an adaptive trade-off between optimality and stability. 
Our method outperforms state-of-the-art baselines in experiments on various benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/wan22c/wan22c.pdf", "supp": "", "pdf_size": 14550625, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8677120929836692417&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Artificial Intelligence, Xian Jiaotong University, Xian, Shaanxi, China; School of Artificial Intelligence, Xian Jiaotong University, Xian, Shaanxi, China; School of Artificial Intelligence, Xian Jiaotong University, Xian, Shaanxi, China; School of Artificial Intelligence, Xian Jiaotong University, Xian, Shaanxi, China; School of Artificial Intelligence, Xian Jiaotong University, Xian, Shaanxi, China", "aff_domain": "mail.xjtu.edu.cn; ; ; ; ", "email": "mail.xjtu.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wan22c.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Xian Jiao Tong University", "aff_unique_dep": "School of Artificial Intelligence", "aff_unique_url": "http://www.xjtu.edu.cn", "aff_unique_abbr": "XJTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Xian", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Greedy when Sure and Conservative when Uncertain about the Opponents", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17557", "id": "17557", "proceeding": "https://proceedings.mlr.press/v162/fu22b.html", "poster": "/media/PosterPDFs/ICML%202022/a7c628dced6a691f1fd31aebc647a0a8.png?t=1656074191.281882", "slides": "/media/icml-2022/Slides/17557.pdf", "author_site": "Haobo Fu, Ye Tian, Hongxiang Yu, Weiming Liu, Shuang Wu, Jiechao Xiong, Ying Wen, Kai Li, Junliang Xing, Qiang Fu, Wei Yang", "author": "Haobo Fu; Ye Tian; Hongxiang Yu; Weiming Liu; Shuang Wu; Jiechao Xiong; Ying Wen; Kai Li; Junliang Xing; Qiang Fu; Wei Yang", "abstract": "We develop a new approach, named Greedy when Sure and Conservative when Uncertain (GSCU), to competing online against unknown and nonstationary opponents. GSCU improves in four aspects: 1) introduces a novel way of learning opponent policy embeddings offline; 2) trains offline a single best response (conditional additionally on our opponent policy embedding) instead of a finite set of separate best responses against any opponent; 3) computes online a posterior of the current opponent policy embedding, without making the discrete and ineffective decision which type the current opponent belongs to; and 4) selects online between a real-time greedy policy and a fixed conservative policy via an adversarial bandit algorithm, gaining a theoretically better regret than adhering to either. Experimental studies on popular benchmarks demonstrate GSCU\u2019s superiority over the state-of-the-art methods. 
The code is available online at \\url{https://github.com/YeTianJHU/GSCU}.", "bibtex": "@InProceedings{pmlr-v162-fu22b,\n title = \t {Greedy when Sure and Conservative when Uncertain about the Opponents},\n author = {Fu, Haobo and Tian, Ye and Yu, Hongxiang and Liu, Weiming and Wu, Shuang and Xiong, Jiechao and Wen, Ying and Li, Kai and Xing, Junliang and Fu, Qiang and Yang, Wei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6829--6848},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fu22b/fu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/fu22b.html},\n abstract = \t {We develop a new approach, named Greedy when Sure and Conservative when Uncertain (GSCU), to competing online against unknown and nonstationary opponents. GSCU improves in four aspects: 1) introduces a novel way of learning opponent policy embeddings offline; 2) trains offline a single best response (conditional additionally on our opponent policy embedding) instead of a finite set of separate best responses against any opponent; 3) computes online a posterior of the current opponent policy embedding, without making the discrete and ineffective decision which type the current opponent belongs to; and 4) selects online between a real-time greedy policy and a fixed conservative policy via an adversarial bandit algorithm, gaining a theoretically better regret than adhering to either. Experimental studies on popular benchmarks demonstrate GSCU\u2019s superiority over the state-of-the-art methods. The code is available online at \\url{https://github.com/YeTianJHU/GSCU}.}\n}", "pdf": "https://proceedings.mlr.press/v162/fu22b/fu22b.pdf", "supp": "", "pdf_size": 1206049, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17801434102156771639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "email": ";;;;;;;;;;", "github": "https://github.com/YeTianJHU/GSCU", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/fu22b.html" }, { "title": "Guarantees for Epsilon-Greedy Reinforcement Learning with Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18295", "id": "18295", "proceeding": "https://proceedings.mlr.press/v162/dann22a.html", "poster": "/media/PosterPDFs/ICML%202022/100d9f30ca54b18d14821dc88fea0631.png?t=1658090568.736106", "slides": "", "author_site": "Chris Dann, Yishay Mansour, Mehryar Mohri, Ayush Sekhari, Karthik Sridharan", "author": "Chris Dann; Yishay Mansour; Mehryar Mohri; Ayush Sekhari; Karthik Sridharan", "abstract": "Myopic exploration policies such as epsilon-greedy, softmax, or Gaussian noise fail to explore efficiently in some reinforcement learning tasks and yet, they perform well in many others. In fact, in practice, they are often selected as the top choices, due to their simplicity. But, for what tasks do such policies succeed? Can we give theoretical guarantees for their favorable performance? These crucial questions have been scarcely investigated, despite the prominent practical importance of these policies. 
This paper presents a theoretical analysis of such policies and provides the first regret and sample-complexity bounds for reinforcement learning with myopic exploration. Our results apply to value-function-based algorithms in episodic MDPs with bounded Bellman Eluder dimension. We propose a new complexity measure called myopic exploration gap, denoted by alpha, that captures a structural property of the MDP, the exploration policy and the given value function class. We show that the sample-complexity of myopic exploration scales quadratically with the inverse of this quantity, 1 / alpha^2. We further demonstrate through concrete examples that myopic exploration gap is indeed favorable in several tasks where myopic exploration succeeds, due to the corresponding dynamics and reward structure.", "bibtex": "@InProceedings{pmlr-v162-dann22a,\n title = \t {Guarantees for Epsilon-Greedy Reinforcement Learning with Function Approximation},\n author = {Dann, Chris and Mansour, Yishay and Mohri, Mehryar and Sekhari, Ayush and Sridharan, Karthik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4666--4689},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dann22a/dann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dann22a.html},\n abstract = \t {Myopic exploration policies such as epsilon-greedy, softmax, or Gaussian noise fail to explore efficiently in some reinforcement learning tasks and yet, they perform well in many others. In fact, in practice, they are often selected as the top choices, due to their simplicity. But, for what tasks do such policies succeed? Can we give theoretical guarantees for their favorable performance? These crucial questions have been scarcely investigated, despite the prominent practical importance of these policies. This paper presents a theoretical analysis of such policies and provides the first regret and sample-complexity bounds for reinforcement learning with myopic exploration. Our results apply to value-function-based algorithms in episodic MDPs with bounded Bellman Eluder dimension. We propose a new complexity measure called myopic exploration gap, denoted by alpha, that captures a structural property of the MDP, the exploration policy and the given value function class. We show that the sample-complexity of myopic exploration scales quadratically with the inverse of this quantity, 1 / alpha^2. 
We further demonstrate through concrete examples that myopic exploration gap is indeed favorable in several tasks where myopic exploration succeeds, due to the corresponding dynamics and reward structure.}\n}", "pdf": "https://proceedings.mlr.press/v162/dann22a/dann22a.pdf", "supp": "", "pdf_size": 924057, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14338628006434539205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Research; Google Research + Tel Aviv University; Google Research + Courant Institute of Mathematical Sciences; Cornell University; Cornell University", "aff_domain": "cdann.net; ; ; ; ", "email": "cdann.net; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dann22a.html", "aff_unique_index": "0;0+1;0+2;3;3", "aff_unique_norm": "Google;Tel Aviv University;Courant Institute of Mathematical Sciences;Cornell University", "aff_unique_dep": "Google Research;;Mathematical Sciences;", "aff_unique_url": "https://research.google;https://www.tau.ac.il;https://cims.nyu.edu;https://www.cornell.edu", "aff_unique_abbr": "Google Research;TAU;CIMS;Cornell", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0+1;0+0;0;0", "aff_country_unique": "United States;Israel" }, { "title": "Guided-TTS: A Diffusion Model for Text-to-Speech via Classifier Guidance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17137", "id": "17137", "proceeding": "https://proceedings.mlr.press/v162/kim22d.html", "poster": "/media/PosterPDFs/ICML%202022/82161242827b703e6acf9c726942a1e4_Kjt9sKC.png?t=1657612269.330605", "slides": "", "author_site": "Heeseung Kim, Sungwon Kim, Sungroh Yoon", "author": "Heeseung Kim; Sungwon Kim; Sungroh Yoon", "abstract": "We propose Guided-TTS, a high-quality text-to-speech (TTS) model that does not require any transcript of target speaker using classifier guidance. Guided-TTS combines an unconditional diffusion probabilistic model with a separately trained phoneme classifier for classifier guidance. Our unconditional diffusion model learns to generate speech without any context from untranscribed speech data. For TTS synthesis, we guide the generative process of the diffusion model with a phoneme classifier trained on a large-scale speech recognition dataset. We present a norm-based scaling method that reduces the pronunciation errors of classifier guidance in Guided-TTS. We show that Guided-TTS achieves a performance comparable to that of the state-of-the-art TTS model, Grad-TTS, without any transcript for LJSpeech. 
We further demonstrate that Guided-TTS performs well on diverse datasets including a long-form untranscribed dataset.", "bibtex": "@InProceedings{pmlr-v162-kim22d,\n title = \t {Guided-{TTS}: A Diffusion Model for Text-to-Speech via Classifier Guidance},\n author = {Kim, Heeseung and Kim, Sungwon and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11119--11133},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22d/kim22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22d.html},\n abstract = \t {We propose Guided-TTS, a high-quality text-to-speech (TTS) model that does not require any transcript of target speaker using classifier guidance. Guided-TTS combines an unconditional diffusion probabilistic model with a separately trained phoneme classifier for classifier guidance. Our unconditional diffusion model learns to generate speech without any context from untranscribed speech data. For TTS synthesis, we guide the generative process of the diffusion model with a phoneme classifier trained on a large-scale speech recognition dataset. We present a norm-based scaling method that reduces the pronunciation errors of classifier guidance in Guided-TTS. We show that Guided-TTS achieves a performance comparable to that of the state-of-the-art TTS model, Grad-TTS, without any transcript for LJSpeech. We further demonstrate that Guided-TTS performs well on diverse datasets including a long-form untranscribed dataset.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22d/kim22d.pdf", "supp": "", "pdf_size": 651902, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15668427415140094463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Data Science and AI Lab., Seoul National University; Data Science and AI Lab., Seoul National University; Department of ECE and Interdisciplinary Program in AI, Seoul National University", "aff_domain": "snu.ac.kr; ;snu.ac.kr", "email": "snu.ac.kr; ;snu.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kim22d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Data Science and AI Lab.", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "H-Consistency Bounds for Surrogate Loss Minimizers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16789", "id": "16789", "proceeding": "https://proceedings.mlr.press/v162/awasthi22c.html", "poster": "/media/PosterPDFs/ICML%202022/f862d13454fd267baa5fedfffb200567.png?t=1658174082.5243714", "slides": "", "author_site": "Pranjal Awasthi, Anqi Mao, Mehryar Mohri, Yutao Zhong", "author": "Pranjal Awasthi; Anqi Mao; Mehryar Mohri; Yutao Zhong", "abstract": "We present a detailed study of estimation errors in terms of surrogate loss estimation errors. We refer to such guarantees as H-consistency bounds, since they account for the hypothesis set H adopted. These guarantees are significantly stronger than H-calibration or H-consistency. 
They are also more informative than similar excess error bounds derived in the literature, when H is the family of all measurable functions. We prove general theorems providing such guarantees, for both the distribution-dependent and distribution-independent settings. We show that our bounds are tight, modulo a convexity assumption. We also show that previous excess error bounds can be recovered as special cases of our general results. We then present a series of explicit bounds in the case of the zero-one loss, with multiple choices of the surrogate loss and for both the family of linear functions and neural networks with one hidden-layer. We further prove more favorable distribution-dependent guarantees in that case. We also present a series of explicit bounds in the case of the adversarial loss, with surrogate losses based on the supremum of the $\\rho$-margin, hinge or sigmoid loss and for the same two general hypothesis sets. Here too, we prove several enhancements of these guarantees under natural distributional assumptions. Finally, we report the results of simulations illustrating our bounds and their tightness.", "bibtex": "@InProceedings{pmlr-v162-awasthi22c,\n title = \t {H-Consistency Bounds for Surrogate Loss Minimizers},\n author = {Awasthi, Pranjal and Mao, Anqi and Mohri, Mehryar and Zhong, Yutao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1117--1174},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/awasthi22c/awasthi22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/awasthi22c.html},\n abstract = \t {We present a detailed study of estimation errors in terms of surrogate loss estimation errors. We refer to such guarantees as H-consistency bounds, since they account for the hypothesis set H adopted. These guarantees are significantly stronger than H-calibration or H-consistency. They are also more informative than similar excess error bounds derived in the literature, when H is the family of all measurable functions. We prove general theorems providing such guarantees, for both the distribution-dependent and distribution-independent settings. We show that our bounds are tight, modulo a convexity assumption. We also show that previous excess error bounds can be recovered as special cases of our general results. We then present a series of explicit bounds in the case of the zero-one loss, with multiple choices of the surrogate loss and for both the family of linear functions and neural networks with one hidden-layer. We further prove more favorable distribution-dependent guarantees in that case. We also present a series of explicit bounds in the case of the adversarial loss, with surrogate losses based on the supremum of the $\\rho$-margin, hinge or sigmoid loss and for the same two general hypothesis sets. Here too, we prove several enhancements of these guarantees under natural distributional assumptions. 
Finally, we report the results of simulations illustrating our bounds and their tightness.}\n}", "pdf": "https://proceedings.mlr.press/v162/awasthi22c/awasthi22c.pdf", "supp": "", "pdf_size": 781160, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9743516823490361519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/awasthi22c.html" }, { "title": "Hardness and Algorithms for Robust and Sparse Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17149", "id": "17149", "proceeding": "https://proceedings.mlr.press/v162/price22a.html", "poster": "", "slides": "", "author_site": "Eric Price, Sandeep Silwal, Samson Zhou", "author": "Eric Price; Sandeep Silwal; Samson Zhou", "abstract": "We explore algorithms and limitations for sparse optimization problems such as sparse linear regression and robust linear regression. The goal of the sparse linear regression problem is to identify a small number of key features, while the goal of the robust linear regression problem is to identify a small number of erroneous measurements. Specifically, the sparse linear regression problem seeks a $k$-sparse vector $x\\in\\mathbb{R}^d$ to minimize $\\|Ax-b\\|_2$, given an input matrix $A\\in\\mathbb{R}^{n\\times d}$ and a target vector $b\\in\\mathbb{R}^n$, while the robust linear regression problem seeks a set $S$ that ignores at most $k$ rows and a vector $x$ to minimize $\\|(Ax-b)_S\\|_2$. We first show bicriteria, NP-hardness of approximation for robust regression building on the work of \\cite{ODonnellWZ15} which implies a similar result for sparse regression. We further show fine-grained hardness of robust regression through a reduction from the minimum-weight $k$-clique conjecture. On the positive side, we give an algorithm for robust regression that achieves arbitrarily accurate additive error and uses runtime that closely matches the lower bound from the fine-grained hardness result, as well as an algorithm for sparse regression with similar runtime. Both our upper and lower bounds rely on a general reduction from robust linear regression to sparse regression that we introduce. Our algorithms, inspired by the 3SUM problem, use approximate nearest neighbor data structures and may be of independent interest for solving sparse optimization problems. For instance, we demonstrate that our techniques can also be used for the well-studied sparse PCA problem.", "bibtex": "@InProceedings{pmlr-v162-price22a,\n title = \t {Hardness and Algorithms for Robust and Sparse Optimization},\n author = {Price, Eric and Silwal, Sandeep and Zhou, Samson},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17926--17944},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/price22a/price22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/price22a.html},\n abstract = \t {We explore algorithms and limitations for sparse optimization problems such as sparse linear regression and robust linear regression. 
The goal of the sparse linear regression problem is to identify a small number of key features, while the goal of the robust linear regression problem is to identify a small number of erroneous measurements. Specifically, the sparse linear regression problem seeks a $k$-sparse vector $x\\in\\mathbb{R}^d$ to minimize $\\|Ax-b\\|_2$, given an input matrix $A\\in\\mathbb{R}^{n\\times d}$ and a target vector $b\\in\\mathbb{R}^n$, while the robust linear regression problem seeks a set $S$ that ignores at most $k$ rows and a vector $x$ to minimize $\\|(Ax-b)_S\\|_2$. We first show bicriteria, NP-hardness of approximation for robust regression building on the work of \\cite{ODonnellWZ15} which implies a similar result for sparse regression. We further show fine-grained hardness of robust regression through a reduction from the minimum-weight $k$-clique conjecture. On the positive side, we give an algorithm for robust regression that achieves arbitrarily accurate additive error and uses runtime that closely matches the lower bound from the fine-grained hardness result, as well as an algorithm for sparse regression with similar runtime. Both our upper and lower bounds rely on a general reduction from robust linear regression to sparse regression that we introduce. Our algorithms, inspired by the 3SUM problem, use approximate nearest neighbor data structures and may be of independent interest for solving sparse optimization problems. For instance, we demonstrate that our techniques can also be used for the well-studied sparse PCA problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/price22a/price22a.pdf", "supp": "", "pdf_size": 394473, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12651541254620646864&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, The University of Texas at Austin; Electrical Engineering and Computer Science Department, Massachusetts Institute of Technology; Computer Science Department, Carnegie Mellon University", "aff_domain": "gmail.com;mit.edu;gmail.com", "email": "gmail.com;mit.edu;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/price22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Texas at Austin;Massachusetts Institute of Technology;Carnegie Mellon University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Electrical Engineering and Computer Science Department;Computer Science Department", "aff_unique_url": "https://www.utexas.edu;https://web.mit.edu;https://www.cmu.edu", "aff_unique_abbr": "UT Austin;MIT;CMU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Austin;Cambridge;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Head2Toe: Utilizing Intermediate Representations for Better Transfer Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16679", "id": "16679", "proceeding": "https://proceedings.mlr.press/v162/evci22a.html", "poster": "/media/PosterPDFs/ICML%202022/28267ab848bcf807b2ed53c3a8f8fc8a.png?t=1657915919.0249746", "slides": "", "author_site": "Utku Evci, Vincent Dumoulin, Hugo Larochelle, Michael Mozer", "author": "Utku Evci; Vincent Dumoulin; Hugo Larochelle; Michael C Mozer", "abstract": "Transfer-learning methods aim to improve performance in a data-scarce target domain using a model pretrained on a data-rich source domain. 
A cost-efficient strategy, linear probing, involves freezing the source model and training a new classification head for the target domain. This strategy is outperformed by a more costly but state-of-the-art method \u2013 fine-tuning all parameters of the source model to the target domain \u2013 possibly because fine-tuning allows the model to leverage useful information from intermediate layers which is otherwise discarded by the later previously trained layers. We explore the hypothesis that these intermediate layers might be directly exploited. We propose a method, Head-to-Toe probing (Head2Toe), that selects features from all layers of the source model to train a classification head for the target-domain. In evaluations on the Visual Task Adaptation Benchmark-1k, Head2Toe matches performance obtained with fine-tuning on average while reducing training and storage cost hundred folds or more, but critically, for out-of-distribution transfer, Head2Toe outperforms fine-tuning. Code used in our experiments can be found in supplementary materials.", "bibtex": "@InProceedings{pmlr-v162-evci22a,\n title = \t {{H}ead2{T}oe: Utilizing Intermediate Representations for Better Transfer Learning},\n author = {Evci, Utku and Dumoulin, Vincent and Larochelle, Hugo and Mozer, Michael C},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6009--6033},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/evci22a/evci22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/evci22a.html},\n abstract = \t {Transfer-learning methods aim to improve performance in a data-scarce target domain using a model pretrained on a data-rich source domain. A cost-efficient strategy, linear probing, involves freezing the source model and training a new classification head for the target domain. This strategy is outperformed by a more costly but state-of-the-art method \u2013 fine-tuning all parameters of the source model to the target domain \u2013 possibly because fine-tuning allows the model to leverage useful information from intermediate layers which is otherwise discarded by the later previously trained layers. We explore the hypothesis that these intermediate layers might be directly exploited. We propose a method, Head-to-Toe probing (Head2Toe), that selects features from all layers of the source model to train a classification head for the target-domain. In evaluations on the Visual Task Adaptation Benchmark-1k, Head2Toe matches performance obtained with fine-tuning on average while reducing training and storage cost hundred folds or more, but critically, for out-of-distribution transfer, Head2Toe outperforms fine-tuning. 
Code used in our experiments can be found in supplementary materials.}\n}", "pdf": "https://proceedings.mlr.press/v162/evci22a/evci22a.pdf", "supp": "", "pdf_size": 1194634, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12027550380073751806&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "https://github.com/google-research/head2toe", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/evci22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hermite Polynomial Features for Private Data Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16813", "id": "16813", "proceeding": "https://proceedings.mlr.press/v162/vinaroz22a.html", "poster": "/media/PosterPDFs/ICML%202022/f29c21d4897f78948b91f03172341b7b_wogoa4V.png?t=1657904654.555713", "slides": "", "author_site": "Margarita Vinaroz, Mohammad-Amin Charusaie, Frederik Harder, Kamil Adamczewski, Mi Jung Park", "author": "Margarita Vinaroz; Mohammad-Amin Charusaie; Frederik Harder; Kamil Adamczewski; Mi Jung Park", "abstract": "Kernel mean embedding is a useful tool to compare probability measures. Despite its usefulness, kernel mean embedding considers infinite-dimensional features, which are challenging to handle in the context of differentially private data generation. A recent work, DP-MERF (Harder et al., 2021), proposes to approximate the kernel mean embedding of data distribution using finite-dimensional random features, which yields an analytically tractable sensitivity of approximate kernel mean embedding. However, the required number of random features in DP-MERF is excessively high, often ten thousand to a hundred thousand, which worsens the sensitivity of the approximate kernel mean embedding. To improve the sensitivity, we propose to replace random features with Hermite polynomial features. Unlike the random features, the Hermite polynomial features are ordered, where the features at the low orders contain more information on the distribution than those at the high orders. Hence, a relatively low order of Hermite polynomial features can more accurately approximate the mean embedding of the data distribution compared to a significantly higher number of random features. 
As a result, the Hermite polynomial features help us to improve the privacy-accuracy trade-off compared to DP-MERF, as demonstrated on several heterogeneous tabular datasets, as well as several image benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-vinaroz22a,\n title = \t {Hermite Polynomial Features for Private Data Generation},\n author = {Vinaroz, Margarita and Charusaie, Mohammad-Amin and Harder, Frederik and Adamczewski, Kamil and Park, Mi Jung},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22300--22324},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vinaroz22a/vinaroz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vinaroz22a.html},\n abstract = \t {Kernel mean embedding is a useful tool to compare probability measures. Despite its usefulness, kernel mean embedding considers infinite-dimensional features, which are challenging to handle in the context of differentially private data generation. A recent work, DP-MERF (Harder et al., 2021), proposes to approximate the kernel mean embedding of data distribution using finite-dimensional random features, which yields an analytically tractable sensitivity of approximate kernel mean embedding. However, the required number of random features in DP-MERF is excessively high, often ten thousand to a hundred thousand, which worsens the sensitivity of the approximate kernel mean embedding. To improve the sensitivity, we propose to replace random features with Hermite polynomial features. Unlike the random features, the Hermite polynomial features are ordered, where the features at the low orders contain more information on the distribution than those at the high orders. Hence, a relatively low order of Hermite polynomial features can more accurately approximate the mean embedding of the data distribution compared to a significantly higher number of random features. As a result, the Hermite polynomial features help us to improve the privacy-accuracy trade-off compared to DP-MERF, as demonstrated on several heterogeneous tabular datasets, as well as several image benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/vinaroz22a/vinaroz22a.pdf", "supp": "", "pdf_size": 2857978, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16485118791106646859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Max Planck Institute for Intelligent Systems, Tuebingen, Germany; Max Planck Institute for Intelligent Systems, Tuebingen, Germany; Max Planck Institute for Intelligent Systems, Tuebingen, Germany; Max Planck Institute for Intelligent Systems, Tuebingen, Germany; University of British Columbia, Vancouver, Canada. 
CIFAR AI Chair at AMII", "aff_domain": "tuebingen.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de;cs.ubc.ca", "email": "tuebingen.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de;tuebingen.mpg.de;cs.ubc.ca", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/vinaroz22a.html", "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of British Columbia", "aff_unique_dep": ";", "aff_unique_url": "https://www.mpituebingen.mpg.de;https://www.ubc.ca", "aff_unique_abbr": "MPI-IS;UBC", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Tuebingen;Vancouver", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "Germany;Canada" }, { "title": "Hessian-Free High-Resolution Nesterov Acceleration For Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16595", "id": "16595", "proceeding": "https://proceedings.mlr.press/v162/li22z.html", "poster": "", "slides": "", "author_site": "Ruilin Li, Hongyuan Zha, Molei Tao", "author": "Ruilin Li; Hongyuan Zha; Molei Tao", "abstract": "Nesterov\u2019s Accelerated Gradient (NAG) for optimization has better performance than its continuous time limit (noiseless kinetic Langevin) when a finite step-size is employed (Shi et al., 2021). This work explores the sampling counterpart of this phenomenon and proposes a diffusion process, whose discretizations can yield accelerated gradient-based MCMC methods. 
More precisely, we reformulate the optimizer of NAG for strongly convex functions (NAG-SC) as a Hessian-Free High-Resolution ODE, change its high-resolution coefficient to a hyperparameter, inject appropriate noise, and discretize the resulting diffusion process. The acceleration effect of the new hyperparameter is quantified and it is not an artificial one created by time-rescaling. Instead, acceleration beyond underdamped Langevin in $W_2$ distance is quantitatively established for log-strongly-concave-and-smooth targets, at both the continuous dynamics level and the discrete algorithm level. Empirical experiments in both log-strongly-concave and multi-modal cases also numerically demonstrate this acceleration.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22z/li22z.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/li22z-supp.zip", "pdf_size": 1064588, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7440675111047681058&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Mathematics, Georgia Institute of Technology; School of Data Science, The Chinese University of Hong Kong, Shenzhen, Shenzhen Institute of Artificial Intelligence and Robotics for Society; School of Mathematics, Georgia Institute of Technology", "aff_domain": "gatech.edu; ;gatech.edu", "email": "gatech.edu; ;gatech.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/li22z.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Georgia Institute of Technology;Chinese University of Hong Kong", "aff_unique_dep": "School of Mathematics;School of Data Science", "aff_unique_url": "https://www.gatech.edu;https://www.cuhk.edu.cn", "aff_unique_abbr": "Georgia Tech;CUHK", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Atlanta;Shenzhen", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "Hierarchical Shrinkage: Improving the accuracy and interpretability of tree-based models.", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16829", "id": "16829", "proceeding": "https://proceedings.mlr.press/v162/agarwal22b.html", "poster": "/media/PosterPDFs/ICML%202022/36ad8b5f42db492827016448975cc22d.png?t=1658188132.2535603", "slides": "", "author_site": "Abhineet Agarwal, Yan Shuo Tan, Omer Ronen, Chandan Singh, Bin Yu", "author": "Abhineet Agarwal; Yan Shuo Tan; Omer Ronen; Chandan Singh; Bin Yu", "abstract": "Decision trees and random forests (RF) are a cornerstone of modern machine learning practice. Due to their tendency to overfit, trees are typically regularized by a variety of techniques that modify their structure (e.g. pruning). We introduce Hierarchical Shrinkage (HS), a post-hoc algorithm which regularizes the tree not by altering its structure, but by shrinking the prediction over each leaf toward the sample means over each of its ancestors, with weights depending on a single regularization parameter and the number of samples in each ancestor. Since HS is a post-hoc method, it is extremely fast, compatible with any tree-growing algorithm and can be used synergistically with other regularization techniques. Extensive experiments over a wide variety of real-world datasets show that HS substantially increases the predictive performance of decision trees even when used in conjunction with other regularization techniques. 
Moreover, we find that applying HS to individual trees in a RF often improves its accuracy and interpretability by simplifying and stabilizing decision boundaries and SHAP values. We further explain HS by showing it to be equivalent to ridge regression on a basis that is constructed of decision stumps associated to the internal nodes of a tree. All code and models are released in a full-fledged package available on Github", "bibtex": "@InProceedings{pmlr-v162-agarwal22b,\n title = \t {Hierarchical Shrinkage: Improving the accuracy and interpretability of tree-based models.},\n author = {Agarwal, Abhineet and Tan, Yan Shuo and Ronen, Omer and Singh, Chandan and Yu, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {111--135},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/agarwal22b/agarwal22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/agarwal22b.html},\n abstract = \t {Decision trees and random forests (RF) are a cornerstone of modern machine learning practice. Due to their tendency to overfit, trees are typically regularized by a variety of techniques that modify their structure (e.g. pruning). We introduce Hierarchical Shrinkage (HS), a post-hoc algorithm which regularizes the tree not by altering its structure, but by shrinking the prediction over each leaf toward the sample means over each of its ancestors, with weights depending on a single regularization parameter and the number of samples in each ancestor. Since HS is a post-hoc method, it is extremely fast, compatible with any tree-growing algorithm and can be used synergistically with other regularization techniques. Extensive experiments over a wide variety of real-world datasets show that HS substantially increases the predictive performance of decision trees even when used in conjunction with other regularization techniques. Moreover, we find that applying HS to individual trees in a RF often improves its accuracy and interpretability by simplifying and stabilizing decision boundaries and SHAP values. We further explain HS by showing it to be equivalent to ridge regression on a basis that is constructed of decision stumps associated to the internal nodes of a tree. 
All code and models are released in a full-fledged package available on Github}\n}", "pdf": "https://proceedings.mlr.press/v162/agarwal22b/agarwal22b.pdf", "supp": "", "pdf_size": 4546671, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12071306017639768084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, UC Berkeley; Department of Statistics, UC Berkeley; Department of Statistics, UC Berkeley; EECS Department, UC Berkeley; Department of Statistics, UC Berkeley + EECS Department, UC Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "email": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "github": "https://github.com/csinva/imodels", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/agarwal22b.html", "aff_unique_index": "0;0;0;0;0+0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "High Probability Guarantees for Nonconvex Stochastic Gradient Descent with Heavy Tails", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18183", "id": "18183", "proceeding": "https://proceedings.mlr.press/v162/li22q.html", "poster": "/media/PosterPDFs/ICML%202022/f1e5284674fd1e360873c29337ebe2d7.png?t=1655671642.146303", "slides": "", "author_site": "Shaojie Li, Yong Liu", "author": "Shaojie Li; Yong Liu", "abstract": "Stochastic gradient descent (SGD) is the workhorse in modern machine learning and data-driven optimization. Despite its popularity, existing theoretical guarantees for SGD are mainly derived in expectation and for convex learning problems. High probability guarantees of nonconvex SGD are scarce, and typically rely on \u201clight-tail\u201d noise assumptions and study the optimization and generalization performance separately. In this paper, we develop high probability bounds for nonconvex SGD with a joint perspective of optimization and generalization performance. Instead of the light tail assumption, we consider the gradient noise following a heavy-tailed sub-Weibull distribution, a novel class generalizing the sub-Gaussian and sub-Exponential families to potentially heavier-tailed distributions. Under these complicated settings, we first present high probability bounds with best-known rates in general nonconvex learning, then move to nonconvex learning with a gradient dominance curvature condition, for which we improve the learning guarantees to fast rates. We further obtain sharper learning guarantees by considering a mild Bernstein-type noise condition. Our analysis also reveals the effect of trade-offs between the optimization and generalization performance under different conditions. In the last, we show that gradient clipping can be employed to remove the bounded gradient-type assumptions. 
Additionally, in this case, the stepsize of SGD is completely oblivious to the knowledge of smoothness.", "bibtex": "@InProceedings{pmlr-v162-li22q,\n title = \t {High Probability Guarantees for Nonconvex Stochastic Gradient Descent with Heavy Tails},\n author = {Li, Shaojie and Liu, Yong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12931--12963},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22q/li22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22q.html},\n abstract = \t {Stochastic gradient descent (SGD) is the workhorse in modern machine learning and data-driven optimization. Despite its popularity, existing theoretical guarantees for SGD are mainly derived in expectation and for convex learning problems. High probability guarantees of nonconvex SGD are scarce, and typically rely on \u201clight-tail\u201d noise assumptions and study the optimization and generalization performance separately. In this paper, we develop high probability bounds for nonconvex SGD with a joint perspective of optimization and generalization performance. Instead of the light tail assumption, we consider the gradient noise following a heavy-tailed sub-Weibull distribution, a novel class generalizing the sub-Gaussian and sub-Exponential families to potentially heavier-tailed distributions. Under these complicated settings, we first present high probability bounds with best-known rates in general nonconvex learning, then move to nonconvex learning with a gradient dominance curvature condition, for which we improve the learning guarantees to fast rates. We further obtain sharper learning guarantees by considering a mild Bernstein-type noise condition. Our analysis also reveals the effect of trade-offs between the optimization and generalization performance under different conditions. In the last, we show that gradient clipping can be employed to remove the bounded gradient-type assumptions. 
Additionally, in this case, the stepsize of SGD is completely oblivious to the knowledge of smoothness.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22q/li22q.pdf", "supp": "", "pdf_size": 433986, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12939682939918686047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Gaoling School of Arti\ufb01cial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China; Gaoling School of Arti\ufb01cial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China", "aff_domain": "ruc.edu.cn;ruc.edu.cn", "email": "ruc.edu.cn;ruc.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22q.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Renmin University of China;Beijing Key Laboratory of Big Data Management and Analysis Methods", "aff_unique_dep": "Gaoling School of Arti\ufb01cial Intelligence;Big Data Management and Analysis", "aff_unique_url": "http://www.ruc.edu.cn;", "aff_unique_abbr": "RUC;", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "Hindering Adversarial Attacks with Implicit Neural Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18019", "id": "18019", "proceeding": "https://proceedings.mlr.press/v162/rusu22a.html", "poster": "/media/PosterPDFs/ICML%202022/598a90004bace6540f0e2230bdc47c09.png?t=1658418735.275521", "slides": "/media/icml-2022/Slides/18019.pdf", "author_site": "Andrei A Rusu, Dan Andrei Calian, Sven Gowal, Raia Hadsell", "author": "Andrei A Rusu; Dan Andrei Calian; Sven Gowal; Raia Hadsell", "abstract": "We introduce the Lossy Implicit Network Activation Coding (LINAC) defence, an input transformation which successfully hinders several common adversarial attacks on CIFAR-10 classifiers for perturbations up to 8/255 in Linf norm and 0.5 in L2 norm. Implicit neural representations are used to approximately encode pixel colour intensities in 2D images such that classifiers trained on transformed data appear to have robustness to small perturbations without adversarial training or large drops in performance. The seed of the random number generator used to initialise and train the implicit neural representation turns out to be necessary information for stronger generic attacks, suggesting its role as a private key. We devise a Parametric Bypass Approximation (PBA) attack strategy for key-based defences, which successfully invalidates an existing method in this category. Interestingly, our LINAC defence also hinders some transfer and adaptive attacks, including our novel PBA strategy. 
Our results emphasise the importance of a broad range of customised attacks despite apparent robustness according to standard evaluations.", "bibtex": "@InProceedings{pmlr-v162-rusu22a,\n title = \t {Hindering Adversarial Attacks with Implicit Neural Representations},\n author = {Rusu, Andrei A and Calian, Dan Andrei and Gowal, Sven and Hadsell, Raia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18910--18934},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rusu22a/rusu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rusu22a.html},\n abstract = \t {We introduce the Lossy Implicit Network Activation Coding (LINAC) defence, an input transformation which successfully hinders several common adversarial attacks on CIFAR-10 classifiers for perturbations up to 8/255 in Linf norm and 0.5 in L2 norm. Implicit neural representations are used to approximately encode pixel colour intensities in 2D images such that classifiers trained on transformed data appear to have robustness to small perturbations without adversarial training or large drops in performance. The seed of the random number generator used to initialise and train the implicit neural representation turns out to be necessary information for stronger generic attacks, suggesting its role as a private key. We devise a Parametric Bypass Approximation (PBA) attack strategy for key-based defences, which successfully invalidates an existing method in this category. Interestingly, our LINAC defence also hinders some transfer and adaptive attacks, including our novel PBA strategy. 
Our results emphasise the importance of a broad range of customised attacks despite apparent robustness according to standard evaluations.}\n}", "pdf": "https://proceedings.mlr.press/v162/rusu22a/rusu22a.pdf", "supp": "", "pdf_size": 4501640, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14287948960663739347&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "deepmind.com; ; ; ", "email": "deepmind.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/rusu22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "History Compression via Language Models in Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17959", "id": "17959", "proceeding": "https://proceedings.mlr.press/v162/paischer22a.html", "poster": "/media/PosterPDFs/ICML%202022/f84d465177e84bb4e756a8319443cdcb.png?t=1657192495.71133", "slides": "", "author_site": "Fabian Paischer, Thomas Adler, Vihang Patil, Angela Bitto-Nemling, Markus Holzleitner, Sebastian Lehner, Hamid Eghbal-zadeh, Sepp Hochreiter", "author": "Fabian Paischer; Thomas Adler; Vihang Patil; Angela Bitto-Nemling; Markus Holzleitner; Sebastian Lehner; Hamid Eghbal-Zadeh; Sepp Hochreiter", "abstract": "In a partially observable Markov decision process (POMDP), an agent typically uses a representation of the past to approximate the underlying MDP. We propose to utilize a frozen Pretrained Language Transformer (PLT) for history representation and compression to improve sample efficiency. To avoid training of the Transformer, we introduce FrozenHopfield, which automatically associates observations with pretrained token embeddings. To form these associations, a modern Hopfield network stores these token embeddings, which are retrieved by queries that are obtained by a random but fixed projection of observations. Our new method, HELM, enables actor-critic network architectures that contain a pretrained language Transformer for history representation as a memory module. Since a representation of the past need not be learned, HELM is much more sample efficient than competitors. On Minigrid and Procgen environments HELM achieves new state-of-the-art results. 
Our code is available at https://github.com/ml-jku/helm.", "bibtex": "@InProceedings{pmlr-v162-paischer22a,\n title = \t {History Compression via Language Models in Reinforcement Learning},\n author = {Paischer, Fabian and Adler, Thomas and Patil, Vihang and Bitto-Nemling, Angela and Holzleitner, Markus and Lehner, Sebastian and Eghbal-Zadeh, Hamid and Hochreiter, Sepp},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17156--17185},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/paischer22a/paischer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/paischer22a.html},\n abstract = \t {In a partially observable Markov decision process (POMDP), an agent typically uses a representation of the past to approximate the underlying MDP. We propose to utilize a frozen Pretrained Language Transformer (PLT) for history representation and compression to improve sample efficiency. To avoid training of the Transformer, we introduce FrozenHopfield, which automatically associates observations with pretrained token embeddings. To form these associations, a modern Hopfield network stores these token embeddings, which are retrieved by queries that are obtained by a random but fixed projection of observations. Our new method, HELM, enables actor-critic network architectures that contain a pretrained language Transformer for history representation as a memory module. Since a representation of the past need not be learned, HELM is much more sample efficient than competitors. On Minigrid and Procgen environments HELM achieves new state-of-the-art results. 
Our code is available at https://github.com/ml-jku/helm.}\n}", "pdf": "https://proceedings.mlr.press/v162/paischer22a/paischer22a.pdf", "supp": "", "pdf_size": 5984785, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3335833011258515063&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria+ELLIS Unit Linz+Institute of Advanced Research in Artificial Intelligence (IARAI), Vienna, Austria; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria+ELLIS Unit Linz; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria+ELLIS Unit Linz+Institute of Advanced Research in Artificial Intelligence (IARAI), Vienna, Austria; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria+ELLIS Unit Linz; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria; LIT AI Lab, Institute for Machine Learning, Johannes Kepler University Linz, Austria+ELLIS Unit Linz+Institute of Advanced Research in Artificial Intelligence (IARAI), Vienna, Austria", "aff_domain": "ml.jku.at; ; ; ; ; ; ;", "email": "ml.jku.at; ; ; ; ; ; ;", "github": "https://github.com/ml-jku/helm", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/paischer22a.html", "aff_unique_index": "0+1+2;0+1;0;0+1+2;0;0+1;0;0+1+2", "aff_unique_norm": "Johannes Kepler University Linz;ELLIS Unit;Institute of Advanced Research in Artificial Intelligence", "aff_unique_dep": "Institute for Machine Learning;;", "aff_unique_url": "https://www.jku.at;https://ellis.eu;", "aff_unique_abbr": "JKU;ELLIS;IARAI", "aff_campus_unique_index": "0+0+1;0+0;0;0+0+1;0;0+0;0;0+0+1", "aff_campus_unique": "Linz;Vienna", "aff_country_unique_index": "0+0+0;0+0;0;0+0+0;0;0+0;0;0+0+0", "aff_country_unique": "Austria" }, { "title": "HousE: Knowledge Graph Embedding with Householder Parameterization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17407", "id": "17407", "proceeding": "https://proceedings.mlr.press/v162/li22ab.html", "poster": "/media/PosterPDFs/ICML%202022/d296c101daa88a51f6ca8cfc1ac79b50_SW4mlv1.png?t=1657723720.7178721", "slides": "", "author_site": "Rui Li, Jianan Zhao, Chaozhuo Li, Di He, Yiqi Wang, Yuming Liu, Hao Sun, Senzhang Wang, Weiwei Deng, Yanming Shen, Xing Xie, Qi Zhang", "author": "Rui Li; Jianan Zhao; Chaozhuo Li; Di He; Yiqi Wang; Yuming Liu; Hao Sun; Senzhang Wang; Weiwei Deng; Yanming Shen; Xing Xie; Qi Zhang", "abstract": "The effectiveness of knowledge graph embedding (KGE) largely depends on the ability to model intrinsic relation patterns and mapping properties. However, existing approaches can only capture some of them with insufficient modeling capacity. In this work, we propose a more powerful KGE framework named HousE, which involves a novel parameterization based on two kinds of Householder transformations: (1) Householder rotations to achieve superior capacity of modeling relation patterns; (2) Householder projections to handle sophisticated relation mapping properties. Theoretically, HousE is capable of modeling crucial relation patterns and mapping properties simultaneously. 
Besides, HousE is a generalization of existing rotation-based models while extending the rotations to high-dimensional spaces. Empirically, HousE achieves new state-of-the-art performance on five benchmark datasets. Our code is available at https://github.com/anrep/HousE.", "bibtex": "@InProceedings{pmlr-v162-li22ab,\n title = \t {{H}ous{E}: Knowledge Graph Embedding with Householder Parameterization},\n author = {Li, Rui and Zhao, Jianan and Li, Chaozhuo and He, Di and Wang, Yiqi and Liu, Yuming and Sun, Hao and Wang, Senzhang and Deng, Weiwei and Shen, Yanming and Xie, Xing and Zhang, Qi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13209--13224},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22ab/li22ab.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22ab.html},\n abstract = \t {The effectiveness of knowledge graph embedding (KGE) largely depends on the ability to model intrinsic relation patterns and mapping properties. However, existing approaches can only capture some of them with insufficient modeling capacity. In this work, we propose a more powerful KGE framework named HousE, which involves a novel parameterization based on two kinds of Householder transformations: (1) Householder rotations to achieve superior capacity of modeling relation patterns; (2) Householder projections to handle sophisticated relation mapping properties. Theoretically, HousE is capable of modeling crucial relation patterns and mapping properties simultaneously. Besides, HousE is a generalization of existing rotation-based models while extending the rotations to high-dimensional spaces. Empirically, HousE achieves new state-of-the-art performance on five benchmark datasets. 
Our code is available at https://github.com/anrep/HousE.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22ab/li22ab.pdf", "supp": "", "pdf_size": 467728, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15337285257575958816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Technology, Dalian University of Technology, Dalian, China; University of Notre Dame, Indiana, USA; Microsoft Research Asia, Beijing, China; Peking University, Beijing, China; Michigan State University, Michigan, USA; Microsoft, Beijing, China; Central South University, Changsha, China; Microsoft, Beijing, China; Microsoft, Beijing, China; Department of Computer Science and Technology, Dalian University of Technology, Dalian, China; Microsoft Research Asia, Beijing, China; Microsoft, Beijing, China", "aff_domain": "dlut.edu.cn; ;microsoft.com; ; ; ; ; ; ; ; ;", "email": "dlut.edu.cn; ;microsoft.com; ; ; ; ; ; ; ; ;", "github": "https://github.com/anrep/HousE", "project": "", "author_num": 12, "oa": "https://proceedings.mlr.press/v162/li22ab.html", "aff_unique_index": "0;1;2;3;4;2;5;2;2;0;2;2", "aff_unique_norm": "Dalian University of Technology;University of Notre Dame;Microsoft;Peking University;Michigan State University;Central South University", "aff_unique_dep": "Department of Computer Science and Technology;;Research;;;", "aff_unique_url": "http://en.dlut.edu.cn/;https://www.nd.edu;https://www.microsoft.com/en-us/research/group/asia;http://www.pku.edu.cn;https://www.msu.edu;http://www.csu.edu.cn", "aff_unique_abbr": "DUT;Notre Dame;MSRA;Peking U;MSU;CSU", "aff_campus_unique_index": "0;1;2;2;3;2;4;2;2;0;2;2", "aff_campus_unique": "Dalian;Notre Dame;Beijing;Michigan;Changsha", "aff_country_unique_index": "0;1;0;0;1;0;0;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "How Faithful is your Synthetic Data? Sample-level Metrics for Evaluating and Auditing Generative Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16229", "id": "16229", "proceeding": "https://proceedings.mlr.press/v162/alaa22a.html", "poster": "/media/PosterPDFs/ICML%202022/d91caca74114d81fdfc578fca82f8d72.png?t=1657716661.4193552", "slides": "", "author_site": "Ahmed Alaa, Boris van Breugel, Evgeny S. Saveliev, Mihaela van der Schaar", "author": "Ahmed Alaa; Boris Van Breugel; Evgeny S. Saveliev; Mihaela van der Schaar", "abstract": "Devising domain- and model-agnostic evaluation metrics for generative models is an important and as yet unresolved problem. Most existing metrics, which were tailored solely to the image synthesis setup, exhibit a limited capacity for diagnosing the different modes of failure of generative models across broader application domains. In this paper, we introduce a 3-dimensional evaluation metric, ($\\alpha$-Precision, $\\beta$-Recall, Authenticity), that characterizes the fidelity, diversity and generalization performance of any generative model in a domain-agnostic fashion. Our metric unifies statistical divergence measures with precision-recall analysis, enabling sample- and distribution-level diagnoses of model fidelity and diversity. We introduce generalization as an additional, independent dimension (to the fidelity-diversity trade-off) that quantifies the extent to which a model copies training data{\u2014}a crucial performance indicator when modeling sensitive data with requirements on privacy. 
The three metric components correspond to (interpretable) probabilistic quantities, and are estimated via sample-level binary classification. The sample-level nature of our metric inspires a novel use case which we call model auditing, wherein we judge the quality of individual samples generated by a (black-box) model, discarding low-quality samples and hence improving the overall model performance in a post-hoc manner.", "bibtex": "@InProceedings{pmlr-v162-alaa22a,\n title = \t {How Faithful is your Synthetic Data? {S}ample-level Metrics for Evaluating and Auditing Generative Models},\n author = {Alaa, Ahmed and Van Breugel, Boris and Saveliev, Evgeny S. and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {290--306},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alaa22a/alaa22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alaa22a.html},\n abstract = \t {Devising domain- and model-agnostic evaluation metrics for generative models is an important and as yet unresolved problem. Most existing metrics, which were tailored solely to the image synthesis setup, exhibit a limited capacity for diagnosing the different modes of failure of generative models across broader application domains. In this paper, we introduce a 3-dimensional evaluation metric, ($\\alpha$-Precision, $\\beta$-Recall, Authenticity), that characterizes the fidelity, diversity and generalization performance of any generative model in a domain-agnostic fashion. Our metric unifies statistical divergence measures with precision-recall analysis, enabling sample- and distribution-level diagnoses of model fidelity and diversity. We introduce generalization as an additional, independent dimension (to the fidelity-diversity trade-off) that quantifies the extent to which a model copies training data{\u2014}a crucial performance indicator when modeling sensitive data with requirements on privacy. The three metric components correspond to (interpretable) probabilistic quantities, and are estimated via sample-level binary classification. 
The sample-level nature of our metric inspires a novel use case which we call model auditing, wherein we judge the quality of individual samples generated by a (black-box) model, discarding low-quality samples and hence improving the overall model performance in a post-hoc manner.}\n}", "pdf": "https://proceedings.mlr.press/v162/alaa22a/alaa22a.pdf", "supp": "", "pdf_size": 3735403, "gs_citation": 271, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15840878488291944826&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Broad Institute of MIT and Harvard+MIT; Cambridge University; Cambridge University; UCLA+MIT", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/alaa22a.html", "aff_unique_index": "0+1;2;2;3+1", "aff_unique_norm": "Broad Institute;Massachusetts Institute of Technology;University of Cambridge;University of California, Los Angeles", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.broadinstitute.org;https://web.mit.edu;https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Broad;MIT;Cambridge;UCLA", "aff_campus_unique_index": ";1;1;2", "aff_campus_unique": ";Cambridge;Los Angeles", "aff_country_unique_index": "0+0;1;1;0+0", "aff_country_unique": "United States;United Kingdom" }, { "title": "How Powerful are Spectral Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17795", "id": "17795", "proceeding": "https://proceedings.mlr.press/v162/wang22am.html", "poster": "/media/PosterPDFs/ICML%202022/227f6afd3b7f89b96c4bb91f95d50f6d.png?t=1657711747.4053886", "slides": "", "author_site": "Xiyuan Wang, Muhan Zhang", "author": "Xiyuan Wang; Muhan Zhang", "abstract": "Spectral Graph Neural Network is a kind of Graph Neural Network (GNN) based on graph signal filters. Some models able to learn arbitrary spectral filters have emerged recently. However, few works analyze the expressive power of spectral GNNs. This paper studies spectral GNNs\u2019 expressive power theoretically. We first prove that even spectral GNNs without nonlinearity can produce arbitrary graph signals and give two conditions for reaching universality. They are: 1) no multiple eigenvalues of graph Laplacian, and 2) no missing frequency components in node features. We also establish a connection between the expressive power of spectral GNNs and Graph Isomorphism (GI) testing, the latter of which is often used to characterize spatial GNNs\u2019 expressive power. Moreover, we study the difference in empirical performance among different spectral GNNs with the same expressive power from an optimization perspective, and motivate the use of an orthogonal basis whose weight function corresponds to the graph signal density in the spectrum. Inspired by the analysis, we propose JacobiConv, which uses Jacobi basis due to its orthogonality and flexibility to adapt to a wide range of weight functions. 
JacobiConv deserts nonlinearity while outperforming all baselines on both synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-wang22am,\n title = \t {How Powerful are Spectral Graph Neural Networks},\n author = {Wang, Xiyuan and Zhang, Muhan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23341--23362},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22am/wang22am.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22am.html},\n abstract = \t {Spectral Graph Neural Network is a kind of Graph Neural Network (GNN) based on graph signal filters. Some models able to learn arbitrary spectral filters have emerged recently. However, few works analyze the expressive power of spectral GNNs. This paper studies spectral GNNs\u2019 expressive power theoretically. We first prove that even spectral GNNs without nonlinearity can produce arbitrary graph signals and give two conditions for reaching universality. They are: 1) no multiple eigenvalues of graph Laplacian, and 2) no missing frequency components in node features. We also establish a connection between the expressive power of spectral GNNs and Graph Isomorphism (GI) testing, the latter of which is often used to characterize spatial GNNs\u2019 expressive power. Moreover, we study the difference in empirical performance among different spectral GNNs with the same expressive power from an optimization perspective, and motivate the use of an orthogonal basis whose weight function corresponds to the graph signal density in the spectrum. Inspired by the analysis, we propose JacobiConv, which uses Jacobi basis due to its orthogonality and flexibility to adapt to a wide range of weight functions. 
JacobiConv deserts nonlinearity while outperforming all baselines on both synthetic and real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22am/wang22am.pdf", "supp": "", "pdf_size": 679921, "gs_citation": 316, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17960766448265380456&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institute for Artificial Intelligence, Peking University + Beijing Institute for General Artificial Intelligence; Institute for Artificial Intelligence, Peking University + Beijing Institute for General Artificial Intelligence", "aff_domain": "pku.edu.cn;pku.edu.cn", "email": "pku.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22am.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Peking University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": "Institute for Artificial Intelligence;", "aff_unique_url": "http://www.pku.edu.cn;http://www.bigaiai.org/", "aff_unique_abbr": "PKU;BIGAI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "How Tempering Fixes Data Augmentation in Bayesian Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17401", "id": "17401", "proceeding": "https://proceedings.mlr.press/v162/bachmann22a.html", "poster": "/media/PosterPDFs/ICML%202022/8091588a3968da46e3e43a76bf3b3a98.png?t=1658243102.363145", "slides": "", "author_site": "Gregor Bachmann, Lorenzo Noci, Thomas Hofmann", "author": "Gregor Bachmann; Lorenzo Noci; Thomas Hofmann", "abstract": "While Bayesian neural networks (BNNs) provide a sound and principled alternative to standard neural networks, an artificial sharpening of the posterior usually needs to be applied to reach comparable performance. This is in stark contrast to theory, dictating that given an adequate prior and a well-specified model, the untempered Bayesian posterior should achieve optimal performance. Despite the community\u2019s extensive efforts, the observed gains in performance still remain disputed with several plausible causes pointing at its origin. While data augmentation has been empirically recognized as one of the main drivers of this effect, a theoretical account of its role, on the other hand, is largely missing. In this work we identify two interlaced factors concurrently influencing the strength of the cold posterior effect, namely the correlated nature of augmentations and the degree of invariance of the employed model to such transformations. By theoretically analyzing simplified settings, we prove that tempering implicitly reduces the misspecification arising from modeling augmentations as i.i.d. data. The temperature mimics the role of the effective sample size, reflecting the gain in information provided by the augmentations. We corroborate our theoretical findings with extensive empirical evaluations, scaling to realistic BNNs. 
By relying on the framework of group convolutions, we experiment with models of varying inherent degree of invariance, confirming its hypothesized relationship with the optimal temperature.", "bibtex": "@InProceedings{pmlr-v162-bachmann22a,\n title = \t {How Tempering Fixes Data Augmentation in {B}ayesian Neural Networks},\n author = {Bachmann, Gregor and Noci, Lorenzo and Hofmann, Thomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1244--1260},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bachmann22a/bachmann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bachmann22a.html},\n abstract = \t {While Bayesian neural networks (BNNs) provide a sound and principled alternative to standard neural networks, an artificial sharpening of the posterior usually needs to be applied to reach comparable performance. This is in stark contrast to theory, dictating that given an adequate prior and a well-specified model, the untempered Bayesian posterior should achieve optimal performance. Despite the community\u2019s extensive efforts, the observed gains in performance still remain disputed with several plausible causes pointing at its origin. While data augmentation has been empirically recognized as one of the main drivers of this effect, a theoretical account of its role, on the other hand, is largely missing. In this work we identify two interlaced factors concurrently influencing the strength of the cold posterior effect, namely the correlated nature of augmentations and the degree of invariance of the employed model to such transformations. By theoretically analyzing simplified settings, we prove that tempering implicitly reduces the misspecification arising from modeling augmentations as i.i.d. data. The temperature mimics the role of the effective sample size, reflecting the gain in information provided by the augmentations. We corroborate our theoretical findings with extensive empirical evaluations, scaling to realistic BNNs. By relying on the framework of group convolutions, we experiment with models of varying inherent degree of invariance, confirming its hypothesized relationship with the optimal temperature.}\n}", "pdf": "https://proceedings.mlr.press/v162/bachmann22a/bachmann22a.pdf", "supp": "", "pdf_size": 9149977, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11999528327264155822&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; Department of Computer Science, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; Department of Computer Science, ETH Z\u00fcrich, Z\u00fcrich, Switzerland", "aff_domain": "inf.ethz.ch;inf.ethz.ch; ", "email": "inf.ethz.ch;inf.ethz.ch; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bachmann22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Z\u00fcrich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "How to Fill the Optimum Set? 
Population Gradient Descent with Harmless Diversity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17595", "id": "17595", "proceeding": "https://proceedings.mlr.press/v162/gong22b.html", "poster": "", "slides": "", "author_site": "Chengyue Gong, , Qiang Liu", "author": "Chengyue Gong; Lemeng Wu; Qiang Liu", "abstract": "Although traditional optimization methods focus on finding a single optimal solution, most objective functions in modern machine learning problems, especially those in deep learning, often have multiple or infinite number of optimal points. Therefore, it is useful to consider the problem of finding a set of diverse points in the optimum set of an objective function. In this work, we frame this problem as a bi-level optimization problem of maximizing a diversity score inside the optimum set of the main loss function, and solve it with a simple population gradient descent framework that iteratively updates the points to maximize the diversity score in a fashion that does not hurt the optimization of the main loss. We demonstrate that our method can efficiently generate diverse solutions on multiple applications, e.g. text-to-image generation, text-to-mesh generation, molecular conformation generation and ensemble neural network training.", "bibtex": "@InProceedings{pmlr-v162-gong22b,\n title = \t {How to Fill the Optimum Set? {P}opulation Gradient Descent with Harmless Diversity},\n author = {Gong, Chengyue and Wu, Lemeng and Liu, Qiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7650--7664},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gong22b/gong22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/gong22b.html},\n abstract = \t {Although traditional optimization methods focus on finding a single optimal solution, most objective functions in modern machine learning problems, especially those in deep learning, often have multiple or infinite number of optimal points. Therefore, it is useful to consider the problem of finding a set of diverse points in the optimum set of an objective function. In this work, we frame this problem as a bi-level optimization problem of maximizing a diversity score inside the optimum set of the main loss function, and solve it with a simple population gradient descent framework that iteratively updates the points to maximize the diversity score in a fashion that does not hurt the optimization of the main loss. We demonstrate that our method can efficiently generate diverse solutions on multiple applications, e.g. 
text-to-image generation, text-to-mesh generation, molecular conformation generation and ensemble neural network training.}\n}", "pdf": "https://proceedings.mlr.press/v162/gong22b/gong22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/gong22b-supp.zip", "pdf_size": 9858826, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14835260946884806648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, University of Texas at Austin; Department of Computer Science, University of Texas at Austin; Department of Computer Science, University of Texas at Austin", "aff_domain": "cs.utexas.edu;cs.utexas.edu; ", "email": "cs.utexas.edu;cs.utexas.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gong22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How to Leverage Unlabeled Data in Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17327", "id": "17327", "proceeding": "https://proceedings.mlr.press/v162/yu22c.html", "poster": "/media/PosterPDFs/ICML%202022/250dd56814ad7c50971ee4020519c6f5.png?t=1657861659.1246047", "slides": "", "author_site": "Tianhe (Kevin) Yu, Aviral Kumar, Yevgen Chebotar, Karol Hausman, Chelsea Finn, Sergey Levine", "author": "Tianhe Yu; Aviral Kumar; Yevgen Chebotar; Karol Hausman; Chelsea Finn; Sergey Levine", "abstract": "Offline reinforcement learning (RL) can learn control policies from static datasets but, like standard RL methods, it requires reward annotations for every transition. In many cases, labeling large datasets with rewards may be costly, especially if those rewards must be provided by human labelers, while collecting diverse unlabeled data might be comparatively inexpensive. How can we best leverage such unlabeled data in offline RL? One natural solution is to learn a reward function from the labeled data and use it to label the unlabeled data. In this paper, we find that, perhaps surprisingly, a much simpler method that simply applies zero rewards to unlabeled data leads to effective data sharing both in theory and in practice, without learning any reward model at all. While this approach might seem strange (and incorrect) at first, we provide extensive theoretical and empirical analysis that illustrates how it trades off reward bias, sample complexity and distributional shift, often leading to good results. We characterize conditions under which this simple strategy is effective, and further show that extending it with a simple reweighting approach can further alleviate the bias introduced by using incorrect reward labels. 
Our empirical evaluation confirms these findings in simulated robotic locomotion, navigation, and manipulation settings.", "bibtex": "@InProceedings{pmlr-v162-yu22c,\n title = \t {How to Leverage Unlabeled Data in Offline Reinforcement Learning},\n author = {Yu, Tianhe and Kumar, Aviral and Chebotar, Yevgen and Hausman, Karol and Finn, Chelsea and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25611--25635},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22c/yu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22c.html},\n abstract = \t {Offline reinforcement learning (RL) can learn control policies from static datasets but, like standard RL methods, it requires reward annotations for every transition. In many cases, labeling large datasets with rewards may be costly, especially if those rewards must be provided by human labelers, while collecting diverse unlabeled data might be comparatively inexpensive. How can we best leverage such unlabeled data in offline RL? One natural solution is to learn a reward function from the labeled data and use it to label the unlabeled data. In this paper, we find that, perhaps surprisingly, a much simpler method that simply applies zero rewards to unlabeled data leads to effective data sharing both in theory and in practice, without learning any reward model at all. While this approach might seem strange (and incorrect) at first, we provide extensive theoretical and empirical analysis that illustrates how it trades off reward bias, sample complexity and distributional shift, often leading to good results. We characterize conditions under which this simple strategy is effective, and further show that extending it with a simple reweighting approach can further alleviate the bias introduced by using incorrect reward labels. 
Our empirical evaluation confirms these findings in simulated robotic locomotion, navigation, and manipulation settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22c/yu22c.pdf", "supp": "", "pdf_size": 2491324, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18094889945460536724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Stanford University+Google Research; Google Research+UC Berkeley; Google Research; Google Research; Stanford University+Google Research; Google Research+UC Berkeley", "aff_domain": "cs.stanford.edu;berkeley.edu; ; ; ; ", "email": "cs.stanford.edu;berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yu22c.html", "aff_unique_index": "0+1;1+2;1;1;0+1;1+2", "aff_unique_norm": "Stanford University;Google;University of California, Berkeley", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.stanford.edu;https://research.google;https://www.berkeley.edu", "aff_unique_abbr": "Stanford;Google Research;UC Berkeley", "aff_campus_unique_index": "0+1;1+2;1;1;0+1;1+2", "aff_campus_unique": "Stanford;Mountain View;Berkeley", "aff_country_unique_index": "0+0;0+0;0;0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "How to Stay Curious while avoiding Noisy TVs using Aleatoric Uncertainty Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18191", "id": "18191", "proceeding": "https://proceedings.mlr.press/v162/mavor-parker22a.html", "poster": "", "slides": "", "author_site": "Augustine Mavor-Parker, Kimberly Young, Caswell Barry, Lewis Griffin", "author": "Augustine Mavor-Parker; Kimberly Young; Caswell Barry; Lewis Griffin", "abstract": "When extrinsic rewards are sparse, artificial agents struggle to explore an environment. Curiosity, implemented as an intrinsic reward for prediction errors, can improve exploration but it is known to fail when faced with action-dependent noise sources (\u2018noisy TVs\u2019). In an attempt to make exploring agents robust to Noisy TVs, we present a simple solution: aleatoric mapping agents (AMAs). AMAs are a novel form of curiosity that explicitly ascertain which state transitions of the environment are unpredictable, even if those dynamics are induced by the actions of the agent. This is achieved by generating separate forward predictions for the mean and aleatoric uncertainty of future states, with the aim of reducing intrinsic rewards for those transitions that are unpredictable. We demonstrate that in a range of environments AMAs are able to circumvent action-dependent stochastic traps that immobilise conventional curiosity driven agents. 
Furthermore, we demonstrate empirically that other common exploration approaches\u2014previously thought to be immune to agent-induced randomness\u2014can be trapped by stochastic dynamics.", "bibtex": "@InProceedings{pmlr-v162-mavor-parker22a,\n title = \t {How to Stay Curious while avoiding Noisy {TV}s using Aleatoric Uncertainty Estimation},\n author = {Mavor-Parker, Augustine and Young, Kimberly and Barry, Caswell and Griffin, Lewis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15220--15240},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mavor-parker22a/mavor-parker22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mavor-parker22a.html},\n abstract = \t {When extrinsic rewards are sparse, artificial agents struggle to explore an environment. Curiosity, implemented as an intrinsic reward for prediction errors, can improve exploration but it is known to fail when faced with action-dependent noise sources (\u2018noisy TVs\u2019). In an attempt to make exploring agents robust to Noisy TVs, we present a simple solution: aleatoric mapping agents (AMAs). AMAs are a novel form of curiosity that explicitly ascertain which state transitions of the environment are unpredictable, even if those dynamics are induced by the actions of the agent. This is achieved by generating separate forward predictions for the mean and aleatoric uncertainty of future states, with the aim of reducing intrinsic rewards for those transitions that are unpredictable. We demonstrate that in a range of environments AMAs are able to circumvent action-dependent stochastic traps that immobilise conventional curiosity driven agents. 
Furthermore, we demonstrate empirically that other common exploration approaches\u2014previously thought to be immune to agent-induced randomness\u2014can be trapped by stochastic dynamics.}\n}", "pdf": "https://proceedings.mlr.press/v162/mavor-parker22a/mavor-parker22a.pdf", "supp": "", "pdf_size": 6758209, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17640958033353795331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Centre for Artificial Intelligence, University College London, UK+Department of Cell and Developmental Biology, University College London, UK+Boston University, Center for Systems Neuroscience, Graduate Program for Neuroscience, USA; Department of Cell and Developmental Biology, University College London, UK; Department of Cell and Developmental Biology, University College London, UK; Department of Computer Science, University College London, UK", "aff_domain": "cs.ucl.ac.uk; ; ; ", "email": "cs.ucl.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mavor-parker22a.html", "aff_unique_index": "0+0+1;0;0;0", "aff_unique_norm": "University College London;Boston University", "aff_unique_dep": "Centre for Artificial Intelligence;Center for Systems Neuroscience, Graduate Program for Neuroscience", "aff_unique_url": "https://www.ucl.ac.uk;https://www.bu.edu", "aff_unique_abbr": "UCL;BU", "aff_campus_unique_index": "0", "aff_campus_unique": "London;", "aff_country_unique_index": "0+0+1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "How to Steer Your Adversary: Targeted and Efficient Model Stealing Defenses with Gradient Redirection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16921", "id": "16921", "proceeding": "https://proceedings.mlr.press/v162/mazeika22a.html", "poster": "/media/PosterPDFs/ICML%202022/324545ee1d35608f4932e6c10c5a2df4.png?t=1658210503.3810854", "slides": "", "author_site": "Mantas Mazeika, Bo Li, David Forsyth", "author": "Mantas Mazeika; Bo Li; David Forsyth", "abstract": "Model stealing attacks present a dilemma for public machine learning APIs. To protect financial investments, companies may be forced to withhold important information about their models that could facilitate theft, including uncertainty estimates and prediction explanations. This compromise is harmful not only to users but also to external transparency. Model stealing defenses seek to resolve this dilemma by making models harder to steal while preserving utility for benign users. However, existing defenses have poor performance in practice, either requiring enormous computational overheads or severe utility trade-offs. To meet these challenges, we present a new approach to model stealing defenses called gradient redirection. At the core of our approach is a provably optimal, efficient algorithm for steering an adversary\u2019s training updates in a targeted manner. Combined with improvements to surrogate networks and a novel coordinated defense strategy, our gradient redirection defense, called GRAD^2, achieves small utility trade-offs and low computational overhead, outperforming the best prior defenses. 
Moreover, we demonstrate how gradient redirection enables reprogramming the adversary with arbitrary behavior, which we hope will foster work on new avenues of defense.", "bibtex": "@InProceedings{pmlr-v162-mazeika22a,\n title = \t {How to Steer Your Adversary: Targeted and Efficient Model Stealing Defenses with Gradient Redirection},\n author = {Mazeika, Mantas and Li, Bo and Forsyth, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15241--15254},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mazeika22a/mazeika22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mazeika22a.html},\n abstract = \t {Model stealing attacks present a dilemma for public machine learning APIs. To protect financial investments, companies may be forced to withhold important information about their models that could facilitate theft, including uncertainty estimates and prediction explanations. This compromise is harmful not only to users but also to external transparency. Model stealing defenses seek to resolve this dilemma by making models harder to steal while preserving utility for benign users. However, existing defenses have poor performance in practice, either requiring enormous computational overheads or severe utility trade-offs. To meet these challenges, we present a new approach to model stealing defenses called gradient redirection. At the core of our approach is a provably optimal, efficient algorithm for steering an adversary\u2019s training updates in a targeted manner. Combined with improvements to surrogate networks and a novel coordinated defense strategy, our gradient redirection defense, called GRAD^2, achieves small utility trade-offs and low computational overhead, outperforming the best prior defenses. Moreover, we demonstrate how gradient redirection enables reprogramming the adversary with arbitrary behavior, which we hope will foster work on new avenues of defense.}\n}", "pdf": "https://proceedings.mlr.press/v162/mazeika22a/mazeika22a.pdf", "supp": "", "pdf_size": 1035282, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12763327756240287958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "UIUC; UIUC; UIUC", "aff_domain": "illinois.edu; ; ", "email": "illinois.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mazeika22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://www.illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "How to Train Your Wide Neural Network Without Backprop: An Input-Weight Alignment Perspective", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16399", "id": "16399", "proceeding": "https://proceedings.mlr.press/v162/boopathy22a.html", "poster": "", "slides": "", "author_site": "Akhilan Boopathy, Ila R.
Fiete", "author": "Akhilan Boopathy; Ila Fiete", "abstract": "Recent works have examined theoretical and empirical properties of wide neural networks trained in the Neural Tangent Kernel (NTK) regime. Given that biological neural networks are much wider than their artificial counterparts, we consider NTK regime wide neural networks as a possible model of biological neural networks. Leveraging NTK theory, we show theoretically that gradient descent drives layerwise weight updates that are aligned with their input activity correlations weighted by error, and demonstrate empirically that the result also holds in finite-width wide networks. The alignment result allows us to formulate a family of biologically-motivated, backpropagation-free learning rules that are theoretically equivalent to backpropagation in infinite-width networks. We test these learning rules on benchmark problems in feedforward and recurrent neural networks and demonstrate, in wide networks, comparable performance to backpropagation. The proposed rules are particularly effective in low data regimes, which are common in biological learning settings.", "bibtex": "@InProceedings{pmlr-v162-boopathy22a,\n title = \t {How to Train Your Wide Neural Network Without Backprop: An Input-Weight Alignment Perspective},\n author = {Boopathy, Akhilan and Fiete, Ila},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2178--2205},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/boopathy22a/boopathy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/boopathy22a.html},\n abstract = \t {Recent works have examined theoretical and empirical properties of wide neural networks trained in the Neural Tangent Kernel (NTK) regime. Given that biological neural networks are much wider than their artificial counterparts, we consider NTK regime wide neural networks as a possible model of biological neural networks. Leveraging NTK theory, we show theoretically that gradient descent drives layerwise weight updates that are aligned with their input activity correlations weighted by error, and demonstrate empirically that the result also holds in finite-width wide networks. The alignment result allows us to formulate a family of biologically-motivated, backpropagation-free learning rules that are theoretically equivalent to backpropagation in infinite-width networks. We test these learning rules on benchmark problems in feedforward and recurrent neural networks and demonstrate, in wide networks, comparable performance to backpropagation. 
The proposed rules are particularly effective in low data regimes, which are common in biological learning settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/boopathy22a/boopathy22a.pdf", "supp": "", "pdf_size": 1017750, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9130275033770297216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Massachusetts Institute of Technology; Massachusetts Institute of Technology", "aff_domain": "mit.edu; ", "email": "mit.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/boopathy22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Human-in-the-loop: Provably Efficient Preference-based Reinforcement Learning with General Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18371", "id": "18371", "proceeding": "https://proceedings.mlr.press/v162/chen22ag.html", "poster": "/media/PosterPDFs/ICML%202022/cf8c9be2a4508a24ae92c9d3d379131d.png?t=1657545397.5740213", "slides": "", "author_site": "Xiaoyu Chen, Han Zhong, Zhuoran Yang, Zhaoran Wang, Liwei Wang", "author": "Xiaoyu Chen; Han Zhong; Zhuoran Yang; Zhaoran Wang; Liwei Wang", "abstract": "We study human-in-the-loop reinforcement learning (RL) with trajectory preferences, where instead of receiving a numeric reward at each step, the RL agent only receives preferences over trajectory pairs from a human overseer. The goal of the RL agent is to learn the optimal policy which is most preferred by the human overseer. Despite the empirical success in various real-world applications, the theoretical understanding of preference-based RL (PbRL) is only limited to the tabular case. In this paper, we propose the first optimistic model-based algorithm for PbRL with general function approximation, which estimates the model using value-targeted regression and calculates the exploratory policies by solving an optimistic planning problem. We prove that our algorithm achieves the regret bound of $\\tilde{O} (\\operatorname{poly}(d H) \\sqrt{K} )$, where $d$ is the complexity measure of the transition and preference model depending on the Eluder dimension and log-covering numbers, $H$ is the planning horizon, $K$ is the number of episodes, and $\\tilde O(\\cdot)$ omits logarithmic terms. Our lower bound indicates that our algorithm is near-optimal when specialized to the linear setting. Furthermore, we extend the PbRL problem by formulating a novel problem called RL with $n$-wise comparisons, and provide the first sample-efficient algorithm for this new setting. 
To the best of our knowledge, this is the first theoretical result for PbRL with (general) function approximation.", "bibtex": "@InProceedings{pmlr-v162-chen22ag,\n title = \t {Human-in-the-loop: Provably Efficient Preference-based Reinforcement Learning with General Function Approximation},\n author = {Chen, Xiaoyu and Zhong, Han and Yang, Zhuoran and Wang, Zhaoran and Wang, Liwei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3773--3793},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ag/chen22ag.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ag.html},\n abstract = \t {We study human-in-the-loop reinforcement learning (RL) with trajectory preferences, where instead of receiving a numeric reward at each step, the RL agent only receives preferences over trajectory pairs from a human overseer. The goal of the RL agent is to learn the optimal policy which is most preferred by the human overseer. Despite the empirical success in various real-world applications, the theoretical understanding of preference-based RL (PbRL) is only limited to the tabular case. In this paper, we propose the first optimistic model-based algorithm for PbRL with general function approximation, which estimates the model using value-targeted regression and calculates the exploratory policies by solving an optimistic planning problem. We prove that our algorithm achieves the regret bound of $\\tilde{O} (\\operatorname{poly}(d H) \\sqrt{K} )$, where $d$ is the complexity measure of the transition and preference model depending on the Eluder dimension and log-covering numbers, $H$ is the planning horizon, $K$ is the number of episodes, and $\\tilde O(\\cdot)$ omits logarithmic terms. Our lower bound indicates that our algorithm is near-optimal when specialized to the linear setting. Furthermore, we extend the PbRL problem by formulating a novel problem called RL with $n$-wise comparisons, and provide the first sample-efficient algorithm for this new setting. 
To the best of our knowledge, this is the first theoretical result for PbRL with (general) function approximation.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ag/chen22ag.pdf", "supp": "", "pdf_size": 384573, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15841297820738153724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Key Laboratory of Machine Perception, MOE, School of Artificial Intelligence, Peking University+Center for Data Science, Peking University; Key Laboratory of Machine Perception, MOE, School of Artificial Intelligence, Peking University+Peng Cheng Laboratory; Department of Statistics and Data Science, Yale University; Department of Industrial Engineering and Management Sciences, Northwestern University; Key Laboratory of Machine Perception, MOE, School of Artificial Intelligence, Peking University+Center for Data Science, Peking University", "aff_domain": "pku.edu.cn;stu.pku.edu.cn;yale.edu;gmail.com;cis.pku.edu.cn", "email": "pku.edu.cn;stu.pku.edu.cn;yale.edu;gmail.com;cis.pku.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22ag.html", "aff_unique_index": "0+0;0+1;2;3;0+0", "aff_unique_norm": "Peking University;Pengcheng Laboratory;Yale University;Northwestern University", "aff_unique_dep": "School of Artificial Intelligence;Peng Cheng Laboratory;Department of Statistics and Data Science;Department of Industrial Engineering and Management Sciences", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn;https://www.yale.edu;https://www.northwestern.edu", "aff_unique_abbr": "PKU;PCL;Yale;NU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0+0;0+0;1;1;0+0", "aff_country_unique": "China;United States" }, { "title": "HyperImpute: Generalized Iterative Imputation with Automatic Model Selection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17571", "id": "17571", "proceeding": "https://proceedings.mlr.press/v162/jarrett22a.html", "poster": "/media/PosterPDFs/ICML%202022/4e62e752ae53fb6a6eebd0f6146aa702.png?t=1657488393.813953", "slides": "", "author_site": "Daniel Jarrett, Bogdan Cebere, Tennison Liu, Alicia Curth, Mihaela van der Schaar", "author": "Daniel Jarrett; Bogdan C Cebere; Tennison Liu; Alicia Curth; Mihaela van der Schaar", "abstract": "Consider the problem of imputing missing values in a dataset. On the one hand, conventional approaches using iterative imputation benefit from the simplicity and customizability of learning conditional distributions directly, but suffer from the practical requirement for appropriate model specification of each and every variable. On the other hand, recent methods using deep generative modeling benefit from the capacity and efficiency of learning with neural network function approximators, but are often difficult to optimize and rely on stronger data assumptions. In this work, we study an approach that marries the advantages of both: We propose *HyperImpute*, a generalized iterative imputation framework for adaptively and automatically configuring column-wise models and their hyperparameters. Practically, we provide a concrete implementation with out-of-the-box learners, optimizers, simulators, and extensible interfaces. 
Empirically, we investigate this framework via comprehensive experiments and sensitivities on a variety of public datasets, and demonstrate its ability to generate accurate imputations relative to a strong suite of benchmarks. Contrary to recent work, we believe our findings constitute a strong defense of the iterative imputation paradigm.", "bibtex": "@InProceedings{pmlr-v162-jarrett22a,\n title = \t {{H}yper{I}mpute: Generalized Iterative Imputation with Automatic Model Selection},\n author = {Jarrett, Daniel and Cebere, Bogdan C and Liu, Tennison and Curth, Alicia and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9916--9937},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jarrett22a/jarrett22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jarrett22a.html},\n abstract = \t {Consider the problem of imputing missing values in a dataset. On the one hand, conventional approaches using iterative imputation benefit from the simplicity and customizability of learning conditional distributions directly, but suffer from the practical requirement for appropriate model specification of each and every variable. On the other hand, recent methods using deep generative modeling benefit from the capacity and efficiency of learning with neural network function approximators, but are often difficult to optimize and rely on stronger data assumptions. In this work, we study an approach that marries the advantages of both: We propose *HyperImpute*, a generalized iterative imputation framework for adaptively and automatically configuring column-wise models and their hyperparameters. Practically, we provide a concrete implementation with out-of-the-box learners, optimizers, simulators, and extensible interfaces. Empirically, we investigate this framework via comprehensive experiments and sensitivities on a variety of public datasets, and demonstrate its ability to generate accurate imputations relative to a strong suite of benchmarks. 
Contrary to recent work, we believe our findings constitute a strong defense of the iterative imputation paradigm.}\n}", "pdf": "https://proceedings.mlr.press/v162/jarrett22a/jarrett22a.pdf", "supp": "", "pdf_size": 1631572, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7345905181972151816&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Applied Mathematics & Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics & Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics & Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics & Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics & Theoretical Physics, University of Cambridge, UK + Department of Electrical Engineering, University of California, Los Angeles, USA", "aff_domain": "cam.ac.uk; ; ; ; ", "email": "cam.ac.uk; ; ; ; ", "github": "https://github.com/vanderschaarlab/hyperimpute", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/jarrett22a.html", "aff_unique_index": "0;0;0;0;0+1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": "Department of Applied Mathematics & Theoretical Physics;Department of Electrical Engineering", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;0;0;0;0+1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;0;0;0;0+1", "aff_country_unique": "United Kingdom;United States" }, { "title": "HyperPrompt: Prompt-based Task-Conditioning of Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16499", "id": "16499", "proceeding": "https://proceedings.mlr.press/v162/he22f.html", "poster": "/media/PosterPDFs/ICML%202022/c2c701fe341a7756ca7fd4eaa83ff63f.png?t=1657221739.2662492", "slides": "", "author_site": "Yun He, Steven Zheng, Yi Tay, Jai Gupta, Yu Du, Vamsi Aribandi, Zhe Zhao, Yaguang Li, Zhao Chen, Don Metzler, Heng-Tze Cheng, Ed Chi", "author": "Yun He; Steven Zheng; Yi Tay; Jai Gupta; Yu Du; Vamsi Aribandi; Zhe Zhao; Yaguang Li; Zhao Chen; Donald Metzler; Heng-Tze Cheng; Ed H. Chi", "abstract": "Prompt-Tuning is a new paradigm for finetuning pre-trained language models in a parameter efficient way. Here, we explore the use of HyperNetworks to generate hyper-prompts: we propose HyperPrompt, a novel architecture for prompt-based task-conditioning of self-attention in Transformers. The hyper-prompts are end-to-end learnable via generation by a HyperNetwork. HyperPrompt allows the network to learn task-specific feature maps where the hyper-prompts serve as task global memories for the queries to attend to, at the same time enabling flexible information sharing among tasks. We show that HyperPrompt is competitive against strong multi-task learning baselines with as few as 0.14% of additional task-conditioning parameters, achieving great parameter and computational efficiency. 
Through extensive empirical experiments, we demonstrate that HyperPrompt can achieve superior performances over strong T5 multi-task learning baselines and parameter-efficient adapter variants including Prompt-Tuning and HyperFormer++ on Natural Language Understanding benchmarks of GLUE and SuperGLUE across many model sizes.", "bibtex": "@InProceedings{pmlr-v162-he22f,\n title = \t {{H}yper{P}rompt: Prompt-based Task-Conditioning of Transformers},\n author = {He, Yun and Zheng, Steven and Tay, Yi and Gupta, Jai and Du, Yu and Aribandi, Vamsi and Zhao, Zhe and Li, Yaguang and Chen, Zhao and Metzler, Donald and Cheng, Heng-Tze and Chi, Ed H.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8678--8690},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22f/he22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22f.html},\n abstract = \t {Prompt-Tuning is a new paradigm for finetuning pre-trained language models in a parameter efficient way. Here, we explore the use of HyperNetworks to generate hyper-prompts: we propose HyperPrompt, a novel architecture for prompt-based task-conditioning of self-attention in Transformers. The hyper-prompts are end-to-end learnable via generation by a HyperNetwork. HyperPrompt allows the network to learn task-specific feature maps where the hyper-prompts serve as task global memories for the queries to attend to, at the same time enabling flexible information sharing among tasks. We show that HyperPrompt is competitive against strong multi-task learning baselines with as few as 0.14% of additional task-conditioning parameters, achieving great parameter and computational efficiency. 
Through extensive empirical experiments, we demonstrate that HyperPrompt can achieve superior performances over strong T5 multi-task learning baselines and parameter-efficient adapter variants including Prompt-Tuning and HyperFormer++ on Natural Language Understanding benchmarks of GLUE and SuperGLUE across many model sizes.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22f/he22f.pdf", "supp": "", "pdf_size": 411379, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18179737290550246602&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Texas A&M University + Google; Google; Google; Google; Google; Google; Google; Google; Waymo LLC; Google; Google; Google", "aff_domain": "google.com;google.com; ; ; ; ; ; ; ; ; ;", "email": "google.com;google.com; ; ; ; ; ; ; ; ; ;", "github": "", "project": "", "author_num": 12, "oa": "https://proceedings.mlr.press/v162/he22f.html", "aff_unique_index": "0+1;1;1;1;1;1;1;1;2;1;1;1", "aff_unique_norm": "Texas A&M University;Google;Waymo", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.tamu.edu;https://www.google.com;https://www.waymo.com", "aff_unique_abbr": "TAMU;Google;Waymo", "aff_campus_unique_index": "1;1;1;1;1;1;1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "HyperTransformer: Model Generation for Supervised and Semi-Supervised Few-Shot Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17687", "id": "17687", "proceeding": "https://proceedings.mlr.press/v162/zhmoginov22a.html", "poster": "/media/PosterPDFs/ICML%202022/e32cc80bf07915058ce90722ee17bb71_1kzLPwX.png?t=1657491886.2261405", "slides": "", "author_site": "Andrey Zhmoginov, Mark Sandler, Maksym Vladymyrov", "author": "Andrey Zhmoginov; Mark Sandler; Maksym Vladymyrov", "abstract": "In this work we propose a HyperTransformer, a Transformer-based model for supervised and semi-supervised few-shot learning that generates weights of a convolutional neural network (CNN) directly from support samples. Since the dependence of a small generated CNN model on a specific task is encoded by a high-capacity Transformer model, we effectively decouple the complexity of the large task space from the complexity of individual tasks. Our method is particularly effective for small target CNN architectures where learning a fixed universal task-independent embedding is not optimal and better performance is attained when the information about the task can modulate all model parameters. 
For larger models we discover that generating the last layer alone allows us to produce competitive or better results than those obtained with state-of-the-art methods while being end-to-end differentiable.", "bibtex": "@InProceedings{pmlr-v162-zhmoginov22a,\n title = \t {{H}yper{T}ransformer: Model Generation for Supervised and Semi-Supervised Few-Shot Learning},\n author = {Zhmoginov, Andrey and Sandler, Mark and Vladymyrov, Maksym},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27075--27098},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhmoginov22a/zhmoginov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhmoginov22a.html},\n abstract = \t {In this work we propose a HyperTransformer, a Transformer-based model for supervised and semi-supervised few-shot learning that generates weights of a convolutional neural network (CNN) directly from support samples. Since the dependence of a small generated CNN model on a specific task is encoded by a high-capacity Transformer model, we effectively decouple the complexity of the large task space from the complexity of individual tasks. Our method is particularly effective for small target CNN architectures where learning a fixed universal task-independent embedding is not optimal and better performance is attained when the information about the task can modulate all model parameters. For larger models we discover that generating the last layer alone allows us to produce competitive or better results than those obtained with state-of-the-art methods while being end-to-end differentiable.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhmoginov22a/zhmoginov22a.pdf", "supp": "", "pdf_size": 4875896, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10377617896492947178&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Google Research; Google Research; Google Research", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhmoginov22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "IDYNO: Learning Nonparametric DAGs from Interventional Dynamic Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17259", "id": "17259", "proceeding": "https://proceedings.mlr.press/v162/gao22a.html", "poster": "/media/PosterPDFs/ICML%202022/d77e68596c15c53c2a33ad143739902d.png?t=1658047848.6455147", "slides": "/media/icml-2022/Slides/17259.pdf", "author_site": "Tian Gao, DEBARUN BHATTACHARJYA, Elliot Nelson, Miao Liu, Yue Yu", "author": "Tian Gao; Debarun Bhattacharjya; Elliot Nelson; Miao Liu; Yue Yu", "abstract": "Causal discovery in the form of a directed acyclic graph (DAG) for time series data has been widely studied in various domains. 
The resulting DAG typically represents a dynamic Bayesian network (DBN), capturing both the instantaneous and time-delayed relationships among variables of interest. We propose a new algorithm, IDYNO, to learn the DAG structure from potentially nonlinear time series data by using a continuous optimization framework that includes a recent formulation for continuous acyclicity constraint. The proposed algorithm is designed to handle both observational and interventional time series data. We demonstrate the promising performance of our method on synthetic benchmark datasets against state-of-the-art baselines. In addition, we show that the proposed method can more accurately learn the underlying structure of a sequential decision model, such as a Markov decision process, with a fixed policy in typical continuous control tasks.", "bibtex": "@InProceedings{pmlr-v162-gao22a,\n title = \t {{IDYNO}: Learning Nonparametric {DAG}s from Interventional Dynamic Data},\n author = {Gao, Tian and Bhattacharjya, Debarun and Nelson, Elliot and Liu, Miao and Yu, Yue},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6988--7001},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22a/gao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22a.html},\n abstract = \t {Causal discovery in the form of a directed acyclic graph (DAG) for time series data has been widely studied in various domains. The resulting DAG typically represents a dynamic Bayesian network (DBN), capturing both the instantaneous and time-delayed relationships among variables of interest. We propose a new algorithm, IDYNO, to learn the DAG structure from potentially nonlinear time series data by using a continuous optimization framework that includes a recent formulation for continuous acyclicity constraint. The proposed algorithm is designed to handle both observational and interventional time series data. We demonstrate the promising performance of our method on synthetic benchmark datasets against state-of-the-art baselines. 
In addition, we show that the proposed method can more accurately learn the underlying structure of a sequential decision model, such as a Markov decision process, with a fixed policy in typical continuous control tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22a/gao22a.pdf", "supp": "", "pdf_size": 534373, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12640208981230843912&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA; Department of Mathematics, Lehigh University, Bethlehem, PA, USA", "aff_domain": "us.ibm.com; ; ; ; ", "email": "us.ibm.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/gao22a.html", "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "IBM;Lehigh University", "aff_unique_dep": "IBM Research;Department of Mathematics", "aff_unique_url": "https://www.ibm.com/research;https://www.lehigh.edu", "aff_unique_abbr": "IBM;Lehigh", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Yorktown Heights;Bethlehem", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "IGLUE: A Benchmark for Transfer Learning across Modalities, Tasks, and Languages", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16295", "id": "16295", "proceeding": "https://proceedings.mlr.press/v162/bugliarello22a.html", "poster": "/media/PosterPDFs/ICML%202022/a600bd172fcabd688500dac58ebda3a0.png?t=1657607801.3174446", "slides": "/media/icml-2022/Slides/16295.pdf", "author_site": "Emanuele Bugliarello, Fangyu Liu, Jonas Pfeiffer, Siva Reddy, Desmond Elliott, Edoardo Maria Ponti, Ivan Vuli\u0107", "author": "Emanuele Bugliarello; Fangyu Liu; Jonas Pfeiffer; Siva Reddy; Desmond Elliott; Edoardo Maria Ponti; Ivan Vuli\u0107", "abstract": "Reliable evaluation benchmarks designed for replicability and comprehensiveness have driven progress in machine learning. Due to the lack of a multilingual benchmark, however, vision-and-language research has mostly focused on English language tasks. To fill this gap, we introduce the Image-Grounded Language Understanding Evaluation benchmark. IGLUE brings together{\u2014}by both aggregating pre-existing datasets and creating new ones{\u2014}visual question answering, cross-modal retrieval, grounded reasoning, and grounded entailment tasks across 20 diverse languages. Our benchmark enables the evaluation of multilingual multimodal models for transfer learning, not only in a zero-shot setting, but also in newly defined few-shot learning setups. Based on the evaluation of the available state-of-the-art models, we find that translate-test transfer is superior to zero-shot transfer and that few-shot learning is hard to harness for many tasks. Moreover, downstream performance is partially explained by the amount of available unlabelled textual data for pretraining, and only weakly by the typological distance of target{\u2013}source languages. 
We hope to encourage future research efforts in this area by releasing the benchmark to the community.", "bibtex": "@InProceedings{pmlr-v162-bugliarello22a,\n title = \t {{IGLUE}: A Benchmark for Transfer Learning across Modalities, Tasks, and Languages},\n author = {Bugliarello, Emanuele and Liu, Fangyu and Pfeiffer, Jonas and Reddy, Siva and Elliott, Desmond and Ponti, Edoardo Maria and Vuli{\\'c}, Ivan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2370--2392},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bugliarello22a.html},\n abstract = \t {Reliable evaluation benchmarks designed for replicability and comprehensiveness have driven progress in machine learning. Due to the lack of a multilingual benchmark, however, vision-and-language research has mostly focused on English language tasks. To fill this gap, we introduce the Image-Grounded Language Understanding Evaluation benchmark. IGLUE brings together{\u2014}by both aggregating pre-existing datasets and creating new ones{\u2014}visual question answering, cross-modal retrieval, grounded reasoning, and grounded entailment tasks across 20 diverse languages. Our benchmark enables the evaluation of multilingual multimodal models for transfer learning, not only in a zero-shot setting, but also in newly defined few-shot learning setups. Based on the evaluation of the available state-of-the-art models, we find that translate-test transfer is superior to zero-shot transfer and that few-shot learning is hard to harness for many tasks. Moreover, downstream performance is partially explained by the amount of available unlabelled textual data for pretraining, and only weakly by the typological distance of target{\u2013}source languages. We hope to encourage future research efforts in this area by releasing the benchmark to the community.}\n}", "pdf": "https://proceedings.mlr.press/v162/bugliarello22a/bugliarello22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bugliarello22a-supp.zip", "pdf_size": 1881891, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16518489029064052823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/bugliarello22a.html" }, { "title": "Identifiability Conditions for Domain Adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17353", "id": "17353", "proceeding": "https://proceedings.mlr.press/v162/gulrajani22a.html", "poster": "/media/PosterPDFs/ICML%202022/9718db12cae6be37f7349779007ee589.png?t=1657873709.7479548", "slides": "", "author_site": "Ishaan Gulrajani, Tatsunori Hashimoto", "author": "Ishaan Gulrajani; Tatsunori Hashimoto", "abstract": "Domain adaptation algorithms and theory have relied upon an assumption that the observed data uniquely specify the correct correspondence between the domains. 
Unfortunately, it is unclear under what conditions this identifiability assumption holds, even when restricting ourselves to the case where a correct bijective map between domains exists. We study this bijective domain mapping problem and provide several new sufficient conditions for the identifiability of linear domain maps. As a consequence of our analysis, we show that weak constraints on the third moment tensor suffice for identifiability, prove identifiability for common latent variable models such as topic models, and give a computationally tractable method for generating certificates for the identifiability of linear maps. Inspired by our certification method, we derive a new objective function for domain mapping that explicitly accounts for uncertainty over maps arising from unidentifiability. We demonstrate that our objective leads to improvements in uncertainty quantification and model performance estimation.", "bibtex": "@InProceedings{pmlr-v162-gulrajani22a,\n title = \t {Identifiability Conditions for Domain Adaptation},\n author = {Gulrajani, Ishaan and Hashimoto, Tatsunori},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7982--7997},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gulrajani22a/gulrajani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gulrajani22a.html},\n abstract = \t {Domain adaptation algorithms and theory have relied upon an assumption that the observed data uniquely specify the correct correspondence between the domains. Unfortunately, it is unclear under what conditions this identifiability assumption holds, even when restricting ourselves to the case where a correct bijective map between domains exists. We study this bijective domain mapping problem and provide several new sufficient conditions for the identifiability of linear domain maps. As a consequence of our analysis, we show that weak constraints on the third moment tensor suffice for identifiability, prove identifiability for common latent variable models such as topic models, and give a computationally tractable method for generating certificates for the identifiability of linear maps. Inspired by our certification method, we derive a new objective function for domain mapping that explicitly accounts for uncertainty over maps arising from unidentifiability. 
We demonstrate that our objective leads to improvements in uncertainty quantification and model performance estimation.}\n}", "pdf": "https://proceedings.mlr.press/v162/gulrajani22a/gulrajani22a.pdf", "supp": "", "pdf_size": 895847, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10553312063568390823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Stanford University; Stanford University", "aff_domain": "stanford.edu; ", "email": "stanford.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/gulrajani22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Identification of Linear Non-Gaussian Latent Hierarchical Structure", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16843", "id": "16843", "proceeding": "https://proceedings.mlr.press/v162/xie22a.html", "poster": "/media/PosterPDFs/ICML%202022/b147a61c1d07c1c999560f62add6dbc7.png?t=1656752954.021781", "slides": "", "author_site": "Feng Xie, Biwei Huang, Zhengming Chen, Yangbo He, zhi geng, Kun Zhang", "author": "Feng Xie; Biwei Huang; Zhengming Chen; Yangbo He; Zhi Geng; Kun Zhang", "abstract": "Traditional causal discovery methods mainly focus on estimating causal relations among measured variables, but in many real-world problems, such as questionnaire-based psychometric studies, measured variables are generated by latent variables that are causally related. Accordingly, this paper investigates the problem of discovering the hidden causal variables and estimating the causal structure, including both the causal relations among latent variables and those between latent and measured variables. We relax the frequently-used measurement assumption and allow the children of latent variables to be latent as well, and hence deal with a specific type of latent hierarchical causal structure. In particular, we define a minimal latent hierarchical structure and show that for linear non-Gaussian models with the minimal latent hierarchical structure, the whole structure is identifiable from only the measured variables. Moreover, we develop a principled method to identify the structure by testing for Generalized Independent Noise (GIN) conditions in specific ways. 
Experimental results on both synthetic and real-world data show the effectiveness of the proposed approach.", "bibtex": "@InProceedings{pmlr-v162-xie22a,\n title = \t {Identification of Linear Non-{G}aussian Latent Hierarchical Structure},\n author = {Xie, Feng and Huang, Biwei and Chen, Zhengming and He, Yangbo and Geng, Zhi and Zhang, Kun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24370--24387},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xie22a/xie22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xie22a.html},\n abstract = \t {Traditional causal discovery methods mainly focus on estimating causal relations among measured variables, but in many real-world problems, such as questionnaire-based psychometric studies, measured variables are generated by latent variables that are causally related. Accordingly, this paper investigates the problem of discovering the hidden causal variables and estimating the causal structure, including both the causal relations among latent variables and those between latent and measured variables. We relax the frequently-used measurement assumption and allow the children of latent variables to be latent as well, and hence deal with a specific type of latent hierarchical causal structure. In particular, we define a minimal latent hierarchical structure and show that for linear non-Gaussian models with the minimal latent hierarchical structure, the whole structure is identifiable from only the measured variables. Moreover, we develop a principled method to identify the structure by testing for Generalized Independent Noise (GIN) conditions in specific ways. Experimental results on both synthetic and real-world data show the effectiveness of the proposed approach.}\n}", "pdf": "https://proceedings.mlr.press/v162/xie22a/xie22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/xie22a-supp.zip", "pdf_size": 496304, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2047224264641009647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/xie22a.html" }, { "title": "Identity-Disentangled Adversarial Augmentation for Self-supervised Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17859", "id": "17859", "proceeding": "https://proceedings.mlr.press/v162/yang22s.html", "poster": "/media/PosterPDFs/ICML%202022/766d856ef1a6b02f93d894415e6bfa0e.png?t=1657552420.6052325", "slides": "", "author_site": "Kaiwen Yang, Tianyi Zhou, Xinmei Tian, Dacheng Tao", "author": "Kaiwen Yang; Tianyi Zhou; Xinmei Tian; Dacheng Tao", "abstract": "Data augmentation is critical to contrastive self-supervised learning, whose goal is to distinguish a sample\u2019s augmentations (positives) from other samples (negatives). However, strong augmentations may change the sample-identity of the positives, while weak augmentation produces easy positives/negatives leading to nearly-zero loss and ineffective learning. 
In this paper, we study a simple adversarial augmentation method that can modify training data to be hard positives/negatives without distorting the key information about their original identities. In particular, we decompose a sample $x$ to be its variational auto-encoder (VAE) reconstruction $G(x)$ plus the residual $R(x)=x-G(x)$, where $R(x)$ retains most identity-distinctive information due to an information-theoretic interpretation of the VAE objective. We then adversarially perturb $G(x)$ in the VAE\u2019s bottleneck space and adds it back to the original $R(x)$ as an augmentation, which is therefore sufficiently challenging for contrastive learning and meanwhile preserves the sample identity intact. We apply this \u201cidentity-disentangled adversarial augmentation (IDAA)\u201d to different self-supervised learning methods. On multiple benchmark datasets, IDAA consistently improves both their efficiency and generalization performance. We further show that IDAA learned on a dataset can be transferred to other datasets. Code is available at \\href{https://github.com/kai-wen-yang/IDAA}{https://github.com/kai-wen-yang/IDAA}.", "bibtex": "@InProceedings{pmlr-v162-yang22s,\n title = \t {Identity-Disentangled Adversarial Augmentation for Self-supervised Learning},\n author = {Yang, Kaiwen and Zhou, Tianyi and Tian, Xinmei and Tao, Dacheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25364--25381},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22s/yang22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22s.html},\n abstract = \t {Data augmentation is critical to contrastive self-supervised learning, whose goal is to distinguish a sample\u2019s augmentations (positives) from other samples (negatives). However, strong augmentations may change the sample-identity of the positives, while weak augmentation produces easy positives/negatives leading to nearly-zero loss and ineffective learning. In this paper, we study a simple adversarial augmentation method that can modify training data to be hard positives/negatives without distorting the key information about their original identities. In particular, we decompose a sample $x$ to be its variational auto-encoder (VAE) reconstruction $G(x)$ plus the residual $R(x)=x-G(x)$, where $R(x)$ retains most identity-distinctive information due to an information-theoretic interpretation of the VAE objective. We then adversarially perturb $G(x)$ in the VAE\u2019s bottleneck space and adds it back to the original $R(x)$ as an augmentation, which is therefore sufficiently challenging for contrastive learning and meanwhile preserves the sample identity intact. We apply this \u201cidentity-disentangled adversarial augmentation (IDAA)\u201d to different self-supervised learning methods. On multiple benchmark datasets, IDAA consistently improves both their efficiency and generalization performance. We further show that IDAA learned on a dataset can be transferred to other datasets. 
Code is available at \\href{https://github.com/kai-wen-yang/IDAA}{https://github.com/kai-wen-yang/IDAA}.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22s/yang22s.pdf", "supp": "", "pdf_size": 1659368, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12382448474762580849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Science and Technology of China, Hefei, China; University of Washington, Seattle, USA+University of Maryland, College Park, USA; Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, China; JD Explore Academy, Beijing, China", "aff_domain": "ustc.edu.cn;uw.edu;ustc.edu.cn;gmail.com", "email": "ustc.edu.cn;uw.edu;ustc.edu.cn;gmail.com", "github": "https://github.com/kai-wen-yang/IDAA", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yang22s.html", "aff_unique_index": "0;1+2;3;4", "aff_unique_norm": "University of Science and Technology of China;University of Washington;University of Maryland;Hefei Comprehensive National Science Center;JD", "aff_unique_dep": ";;;Institute of Artificial Intelligence;JD Explore Academy", "aff_unique_url": "http://www.ustc.edu.cn;https://www.washington.edu;https://www/umd.edu;;", "aff_unique_abbr": "USTC;UW;UMD;;", "aff_campus_unique_index": "0;1+2;0;3", "aff_campus_unique": "Hefei;Seattle;College Park;Beijing", "aff_country_unique_index": "0;1+1;0;0", "aff_country_unique": "China;United States" }, { "title": "Image-to-Image Regression with Distribution-Free Uncertainty Quantification and Applications in Imaging", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16833", "id": "16833", "proceeding": "https://proceedings.mlr.press/v162/angelopoulos22a.html", "poster": "/media/PosterPDFs/ICML%202022/42dab56861d81108ee356d037190c315_wd7CgSu.png?t=1657493460.2702668", "slides": "", "author_site": "Anastasios Angelopoulos, Amit Pal Kohli, Stephen Bates, Michael Jordan, Jitendra Malik, Thayer Alshaabi, Srigokul Upadhyayula, Yaniv Romano", "author": "Anastasios N Angelopoulos; Amit Pal Kohli; Stephen Bates; Michael Jordan; Jitendra Malik; Thayer Alshaabi; Srigokul Upadhyayula; Yaniv Romano", "abstract": "Image-to-image regression is an important learning task, used frequently in biological imaging. Current algorithms, however, do not generally offer statistical guarantees that protect against a model\u2019s mistakes and hallucinations. To address this, we develop uncertainty quantification techniques with rigorous statistical guarantees for image-to-image regression problems. In particular, we show how to derive uncertainty intervals around each pixel that are guaranteed to contain the true value with a user-specified confidence probability. Our methods work in conjunction with any base machine learning model, such as a neural network, and endow it with formal mathematical guarantees{\u2014}regardless of the true unknown data distribution or choice of model. Furthermore, they are simple to implement and computationally inexpensive. 
We evaluate our procedure on three image-to-image regression tasks: quantitative phase microscopy, accelerated magnetic resonance imaging, and super-resolution transmission electron microscopy of a Drosophila melanogaster brain.", "bibtex": "@InProceedings{pmlr-v162-angelopoulos22a,\n title = \t {Image-to-Image Regression with Distribution-Free Uncertainty Quantification and Applications in Imaging},\n author = {Angelopoulos, Anastasios N and Kohli, Amit Pal and Bates, Stephen and Jordan, Michael and Malik, Jitendra and Alshaabi, Thayer and Upadhyayula, Srigokul and Romano, Yaniv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {717--730},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/angelopoulos22a/angelopoulos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/angelopoulos22a.html},\n abstract = \t {Image-to-image regression is an important learning task, used frequently in biological imaging. Current algorithms, however, do not generally offer statistical guarantees that protect against a model\u2019s mistakes and hallucinations. To address this, we develop uncertainty quantification techniques with rigorous statistical guarantees for image-to-image regression problems. In particular, we show how to derive uncertainty intervals around each pixel that are guaranteed to contain the true value with a user-specified confidence probability. Our methods work in conjunction with any base machine learning model, such as a neural network, and endow it with formal mathematical guarantees{\u2014}regardless of the true unknown data distribution or choice of model. Furthermore, they are simple to implement and computationally inexpensive. 
We evaluate our procedure on three image-to-image regression tasks: quantitative phase microscopy, accelerated magnetic resonance imaging, and super-resolution transmission electron microscopy of a Drosophila melanogaster brain.}\n}", "pdf": "https://proceedings.mlr.press/v162/angelopoulos22a/angelopoulos22a.pdf", "supp": "", "pdf_size": 4002314, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3321497325155679298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering and Computer Science, University of California, Berkeley; Department of Electrical Engineering and Computer Science, University of California, Berkeley; Department of Electrical Engineering and Computer Science, University of California, Berkeley; Department of Electrical Engineering and Computer Science, University of California, Berkeley; Department of Electrical Engineering and Computer Science, University of California, Berkeley; Advanced Bioimaging Center, Department of Molecular and Cell Biology, University of California, Berkeley; Advanced Bioimaging Center, Department of Molecular and Cell Biology, University of California, Berkeley + Chan Zuckerberg Biohub, San Francisco, CA; Departments of Electrical and Computer Engineering and of Computer Science, Technion - Israel Institute of Technology", "aff_domain": "berkeley.edu;berkeley.edu; ; ; ; ; ; ", "email": "berkeley.edu;berkeley.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/angelopoulos22a.html", "aff_unique_index": "0;0;0;0;0;0;0+1;2", "aff_unique_norm": "University of California, Berkeley;Chan Zuckerberg Biohub;Technion - Israel Institute of Technology", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;;Departments of Electrical and Computer Engineering and of Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.chanzuckerberg.com/science/biohub;https://www.technion.ac.il", "aff_unique_abbr": "UC Berkeley;CZ Biohub;Technion", "aff_campus_unique_index": "0;0;0;0;0;0;0+1", "aff_campus_unique": "Berkeley;San Francisco;", "aff_country_unique_index": "0;0;0;0;0;0;0+0;1", "aff_country_unique": "United States;Israel" }, { "title": "Imitation Learning by Estimating Expertise of Demonstrators", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16733", "id": "16733", "proceeding": "https://proceedings.mlr.press/v162/beliaev22a.html", "poster": "/media/PosterPDFs/ICML%202022/211c1e0b83b9c69fa9c4bdede203c1e3.png?t=1656012538.3613389", "slides": "/media/icml-2022/Slides/16733.pdf", "author_site": "Mark Beliaev, Andy Shih, Stefano Ermon, Dorsa Sadigh, Ramtin Pedarsani", "author": "Mark Beliaev; Andy Shih; Stefano Ermon; Dorsa Sadigh; Ramtin Pedarsani", "abstract": "Many existing imitation learning datasets are collected from multiple demonstrators, each with different expertise at different parts of the environment. Yet, standard imitation learning algorithms typically treat all demonstrators as homogeneous, regardless of their expertise, absorbing the weaknesses of any suboptimal demonstrators. In this work, we show that unsupervised learning over demonstrator expertise can lead to a consistent boost in the performance of imitation learning algorithms. We develop and optimize a joint model over a learned policy and expertise levels of the demonstrators. 
This enables our model to learn from the optimal behavior and filter out the suboptimal behavior of each demonstrator. Our model learns a single policy that can outperform even the best demonstrator, and can be used to estimate the expertise of any demonstrator at any state. We illustrate our findings on real-robotic continuous control tasks from Robomimic and discrete environments such as MiniGrid and chess, out-performing competing methods in 21 out of 23 settings, with an average of 7% and up to 60% improvement in terms of the final reward.", "bibtex": "@InProceedings{pmlr-v162-beliaev22a,\n title = \t {Imitation Learning by Estimating Expertise of Demonstrators},\n author = {Beliaev, Mark and Shih, Andy and Ermon, Stefano and Sadigh, Dorsa and Pedarsani, Ramtin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1732--1748},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/beliaev22a/beliaev22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/beliaev22a.html},\n abstract = \t {Many existing imitation learning datasets are collected from multiple demonstrators, each with different expertise at different parts of the environment. Yet, standard imitation learning algorithms typically treat all demonstrators as homogeneous, regardless of their expertise, absorbing the weaknesses of any suboptimal demonstrators. In this work, we show that unsupervised learning over demonstrator expertise can lead to a consistent boost in the performance of imitation learning algorithms. We develop and optimize a joint model over a learned policy and expertise levels of the demonstrators. This enables our model to learn from the optimal behavior and filter out the suboptimal behavior of each demonstrator. Our model learns a single policy that can outperform even the best demonstrator, and can be used to estimate the expertise of any demonstrator at any state. 
We illustrate our findings on real-robotic continuous control tasks from Robomimic and discrete environments such as MiniGrid and chess, out-performing competing methods in 21 out of 23 settings, with an average of 7% and up to 60% improvement in terms of the final reward.}\n}", "pdf": "https://proceedings.mlr.press/v162/beliaev22a/beliaev22a.pdf", "supp": "", "pdf_size": 1107976, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13040919863635608534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical and Computer Engineering, University of California, Santa Barbara+Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Electrical and Computer Engineering, University of California, Santa Barbara", "aff_domain": "ucsb.edu;cs.stanford.edu; ; ; ", "email": "ucsb.edu;cs.stanford.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/beliaev22a.html", "aff_unique_index": "0+1;1;1;1;0", "aff_unique_norm": "University of California, Santa Barbara;Stanford University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Computer Science", "aff_unique_url": "https://www.ucsb.edu;https://www.stanford.edu", "aff_unique_abbr": "UCSB;Stanford", "aff_campus_unique_index": "0+1;1;1;1;0", "aff_campus_unique": "Santa Barbara;Stanford", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Implicit Bias of Linear Equivariant Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16543", "id": "16543", "proceeding": "https://proceedings.mlr.press/v162/lawrence22a.html", "poster": "/media/PosterPDFs/ICML%202022/5103c3584b063c431bd1268e9b5e76fb.png?t=1657752309.4048896", "slides": "", "author_site": "Hannah Lawrence, Bobak T Kiani, Kristian Georgiev, Andrew Dienes", "author": "Hannah Lawrence; Kristian Georgiev; Andrew Dienes; Bobak T. Kiani", "abstract": "Group equivariant convolutional neural networks (G-CNNs) are generalizations of convolutional neural networks (CNNs) which excel in a wide range of technical applications by explicitly encoding symmetries, such as rotations and permutations, in their architectures. Although the success of G-CNNs is driven by their explicit symmetry bias, a recent line of work has proposed that the implicit bias of training algorithms on particular architectures is key to understanding generalization for overparameterized neural nets. In this context, we show that L-layer full-width linear G-CNNs trained via gradient descent for binary classification converge to solutions with low-rank Fourier matrix coefficients, regularized by the 2/L-Schatten matrix norm. Our work strictly generalizes previous analysis on the implicit bias of linear CNNs to linear G-CNNs over all finite groups, including the challenging setting of non-commutative groups (such as permutations), as well as band-limited G-CNNs over infinite groups. We validate our theorems via experiments on a variety of groups, and empirically explore more realistic nonlinear networks, which locally capture similar regularization patterns. 
Finally, we provide intuitive interpretations of our Fourier space implicit regularization results in real space via uncertainty principles.", "bibtex": "@InProceedings{pmlr-v162-lawrence22a,\n title = \t {Implicit Bias of Linear Equivariant Networks},\n author = {Lawrence, Hannah and Georgiev, Kristian and Dienes, Andrew and Kiani, Bobak T.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12096--12125},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lawrence22a/lawrence22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lawrence22a.html},\n abstract = \t {Group equivariant convolutional neural networks (G-CNNs) are generalizations of convolutional neural networks (CNNs) which excel in a wide range of technical applications by explicitly encoding symmetries, such as rotations and permutations, in their architectures. Although the success of G-CNNs is driven by their explicit symmetry bias, a recent line of work has proposed that the implicit bias of training algorithms on particular architectures is key to understanding generalization for overparameterized neural nets. In this context, we show that L-layer full-width linear G-CNNs trained via gradient descent for binary classification converge to solutions with low-rank Fourier matrix coefficients, regularized by the 2/L-Schatten matrix norm. Our work strictly generalizes previous analysis on the implicit bias of linear CNNs to linear G-CNNs over all finite groups, including the challenging setting of non-commutative groups (such as permutations), as well as band-limited G-CNNs over infinite groups. We validate our theorems via experiments on a variety of groups, and empirically explore more realistic nonlinear networks, which locally capture similar regularization patterns. 
Finally, we provide intuitive interpretations of our Fourier space implicit regularization results in real space via uncertainty principles.}\n}", "pdf": "https://proceedings.mlr.press/v162/lawrence22a/lawrence22a.pdf", "supp": "", "pdf_size": 7330789, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5414336386133292832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA 02139, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA 02139, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA 02139, USA; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, MA 02139, USA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lawrence22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Implicit Bias of the Step Size in Linear Diagonal Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17797", "id": "17797", "proceeding": "https://proceedings.mlr.press/v162/nacson22a.html", "poster": "/media/PosterPDFs/ICML%202022/829083d7452626f6e64b96ec0b734811_sVwz0r7.png?t=1657808186.1246588", "slides": "", "author_site": "Mor Shpigel Nacson, Kavya Ravichandran, Nati Srebro, Daniel Soudry", "author": "Mor Shpigel Nacson; Kavya Ravichandran; Nathan Srebro; Daniel Soudry", "abstract": "Focusing on diagonal linear networks as a model for understanding the implicit bias in underdetermined models, we show how the gradient descent step size can have a large qualitative effect on the implicit bias, and thus on generalization ability. In particular, we show how using large step size for non-centered data can change the implicit bias from a \"kernel\" type behavior to a \"rich\" (sparsity-inducing) regime \u2014 even when gradient flow, studied in previous works, would not escape the \"kernel\" regime. We do so by using dynamic stability, proving that convergence to dynamically stable global minima entails a bound on some weighted $\\ell_1$-norm of the linear predictor, i.e. a \"rich\" regime. 
We prove this leads to good generalization in a sparse regression setting.", "bibtex": "@InProceedings{pmlr-v162-nacson22a,\n title = \t {Implicit Bias of the Step Size in Linear Diagonal Neural Networks},\n author = {Nacson, Mor Shpigel and Ravichandran, Kavya and Srebro, Nathan and Soudry, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16270--16295},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nacson22a/nacson22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nacson22a.html},\n abstract = \t {Focusing on diagonal linear networks as a model for understanding the implicit bias in underdetermined models, we show how the gradient descent step size can have a large qualitative effect on the implicit bias, and thus on generalization ability. In particular, we show how using large step size for non-centered data can change the implicit bias from a \"kernel\" type behavior to a \"rich\" (sparsity-inducing) regime \u2014 even when gradient flow, studied in previous works, would not escape the \"kernel\" regime. We do so by using dynamic stability, proving that convergence to dynamically stable global minima entails a bound on some weighted $\\ell_1$-norm of the linear predictor, i.e. a \"rich\" regime. We prove this leads to good generalization in a sparse regression setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/nacson22a/nacson22a.pdf", "supp": "", "pdf_size": 613502, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2807016287517863475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Technion, Israel; TTI Chicago, USA; TTI Chicago, USA; Technion, Israel", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nacson22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Toyota Technological Institute at Chicago", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://tti-chicago.org", "aff_unique_abbr": "Technion;TTI Chicago", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Implicit Regularization in Hierarchical Tensor Factorization and Deep Convolutional Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16017", "id": "16017", "proceeding": "https://proceedings.mlr.press/v162/razin22a.html", "poster": "/media/PosterPDFs/ICML%202022/f47d0ad31c4c49061b9e505593e3db98_ixm4IQk.png?t=1657521510.7777514", "slides": "/media/icml-2022/Slides/16017_WdAjiqH.pdf", "author_site": "Noam Razin, Asaf Maman, Nadav Cohen", "author": "Noam Razin; Asaf Maman; Nadav Cohen", "abstract": "In the pursuit of explaining implicit regularization in deep learning, prominent focus was given to matrix and tensor factorizations, which correspond to simplified neural networks. It was shown that these models exhibit an implicit tendency towards low matrix and tensor ranks, respectively. 
Drawing closer to practical deep learning, the current paper theoretically analyzes the implicit regularization in hierarchical tensor factorization, a model equivalent to certain deep convolutional neural networks. Through a dynamical systems lens, we overcome challenges associated with hierarchy, and establish implicit regularization towards low hierarchical tensor rank. This translates to an implicit regularization towards locality for the associated convolutional networks. Inspired by our theory, we design explicit regularization discouraging locality, and demonstrate its ability to improve the performance of modern convolutional networks on non-local tasks, in defiance of conventional wisdom by which architectural changes are needed. Our work highlights the potential of enhancing neural networks via theoretical analysis of their implicit regularization.", "bibtex": "@InProceedings{pmlr-v162-razin22a,\n title = \t {Implicit Regularization in Hierarchical Tensor Factorization and Deep Convolutional Neural Networks},\n author = {Razin, Noam and Maman, Asaf and Cohen, Nadav},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18422--18462},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/razin22a/razin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/razin22a.html},\n abstract = \t {In the pursuit of explaining implicit regularization in deep learning, prominent focus was given to matrix and tensor factorizations, which correspond to simplified neural networks. It was shown that these models exhibit an implicit tendency towards low matrix and tensor ranks, respectively. Drawing closer to practical deep learning, the current paper theoretically analyzes the implicit regularization in hierarchical tensor factorization, a model equivalent to certain deep convolutional neural networks. Through a dynamical systems lens, we overcome challenges associated with hierarchy, and establish implicit regularization towards low hierarchical tensor rank. This translates to an implicit regularization towards locality for the associated convolutional networks. Inspired by our theory, we design explicit regularization discouraging locality, and demonstrate its ability to improve the performance of modern convolutional networks on non-local tasks, in defiance of conventional wisdom by which architectural changes are needed. 
Our work highlights the potential of enhancing neural networks via theoretical analysis of their implicit regularization.}\n}", "pdf": "https://proceedings.mlr.press/v162/razin22a/razin22a.pdf", "supp": "", "pdf_size": 2075796, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12909622448171060632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Blavatnik School of Computer Science, Tel Aviv University, Israel; Blavatnik School of Computer Science, Tel Aviv University, Israel; Blavatnik School of Computer Science, Tel Aviv University, Israel", "aff_domain": "mail.tau.ac.il; ; ", "email": "mail.tau.ac.il; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/razin22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "Blavatnik School of Computer Science", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tel Aviv", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Implicit Regularization with Polynomial Growth in Deep Tensor Factorization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18189", "id": "18189", "proceeding": "https://proceedings.mlr.press/v162/hariz22a.html", "poster": "/media/PosterPDFs/ICML%202022/6f0442558302a6ededff195daf67f79b_DPJG6KI.png?t=1657639805.694636", "slides": "/media/icml-2022/Slides/18189.pdf", "author_site": "Kais HARIZ, Hachem Kadri, Stephane Ayache, Maher Moakher, Thierry Artieres", "author": "Kais Hariz; Hachem Kadri; Stephane Ayache; Maher Moakher; Thierry Artieres", "abstract": "We study the implicit regularization effects of deep learning in tensor factorization. While implicit regularization in deep matrix and \u2019shallow\u2019 tensor factorization via linear and certain type of non-linear neural networks promotes low-rank solutions with at most quadratic growth, we show that its effect in deep tensor factorization grows polynomially with the depth of the network. This provides a remarkably faithful description of the observed experimental behaviour. Using numerical experiments, we demonstrate the benefits of this implicit regularization in yielding a more accurate estimation and better convergence properties.", "bibtex": "@InProceedings{pmlr-v162-hariz22a,\n title = \t {Implicit Regularization with Polynomial Growth in Deep Tensor Factorization},\n author = {Hariz, Kais and Kadri, Hachem and Ayache, Stephane and Moakher, Maher and Artieres, Thierry},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8484--8501},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hariz22a/hariz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hariz22a.html},\n abstract = \t {We study the implicit regularization effects of deep learning in tensor factorization. While implicit regularization in deep matrix and \u2019shallow\u2019 tensor factorization via linear and certain type of non-linear neural networks promotes low-rank solutions with at most quadratic growth, we show that its effect in deep tensor factorization grows polynomially with the depth of the network. 
This provides a remarkably faithful description of the observed experimental behaviour. Using numerical experiments, we demonstrate the benefits of this implicit regularization in yielding a more accurate estimation and better convergence properties.}\n}", "pdf": "https://proceedings.mlr.press/v162/hariz22a/hariz22a.pdf", "supp": "", "pdf_size": 1606508, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14114279418660053459&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "aff": "Aix Marseille University, CNRS, LIS, Marseille, France + Ecole Centrale de Marseille, Marseille, France; Aix Marseille University, CNRS, LIS, Marseille, France; Aix Marseille University, CNRS, LIS, Marseille, France; LAMSIN, National Engineering School of Tunis, University of Tunis El Manar, Tunis, Tunisia; Aix Marseille University, CNRS, LIS, Marseille, France + Ecole Centrale de Marseille, Marseille, France", "aff_domain": "univ-amu.fr;univ-amu.fr; ; ;univ-amu.fr", "email": "univ-amu.fr;univ-amu.fr; ; ;univ-amu.fr", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/hariz22a.html", "aff_unique_index": "0+1;0;0;2;0+1", "aff_unique_norm": "Aix Marseille University;Ecole Centrale de Marseille;University of Tunis El Manar", "aff_unique_dep": "CNRS, LIS;;National Engineering School of Tunis", "aff_unique_url": "https://www.univ-amu.fr;https://www.ecm.fr;", "aff_unique_abbr": "AMU;ECM;", "aff_campus_unique_index": "0+0;0;0;1;0+0", "aff_campus_unique": "Marseille;Tunis", "aff_country_unique_index": "0+0;0;0;1;0+0", "aff_country_unique": "France;Tunisia" }, { "title": "Importance Weighted Kernel Bayes\u2019 Rule", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17695", "id": "17695", "proceeding": "https://proceedings.mlr.press/v162/xu22a.html", "poster": "/media/PosterPDFs/ICML%202022/b24d21019de5e59da180f1661904f49a.png?t=1657591407.225098", "slides": "/media/icml-2022/Slides/17695.pdf", "author_site": "Liyuan Xu, Yutian Chen, Arnaud Doucet, Arthur Gretton", "author": "Liyuan Xu; Yutian Chen; Arnaud Doucet; Arthur Gretton", "abstract": "We study a nonparametric approach to Bayesian computation via feature means, where the expectation of prior features is updated to yield expected posterior features, based on regression from kernel or neural net features of the observations. All quantities involved in the Bayesian update are learned from observed data, making the method entirely model-free. The resulting algorithm is a novel instance of a kernel Bayes\u2019 rule (KBR). Our approach is based on importance weighting, which results in superior numerical stability to the existing approach to KBR, which requires operator inversion. We show the convergence of the estimator using a novel consistency analysis on the importance weighting estimator in the infinity norm. We evaluate our KBR on challenging synthetic benchmarks, including a filtering problem with a state-space model involving high dimensional image observations. The proposed method yields uniformly better empirical performance than the existing KBR, and competitive performance with other competing methods. We evaluate our KBR on challenging synthetic benchmarks, including a filtering problem with a state-space model involving high dimensional image observations. 
The proposed method yields uniformly better empirical performance than the existing KBR, and competitive performance with other competing methods.", "bibtex": "@InProceedings{pmlr-v162-xu22a,\n title = \t {Importance Weighted Kernel {B}ayes\u2019 Rule},\n author = {Xu, Liyuan and Chen, Yutian and Doucet, Arnaud and Gretton, Arthur},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24524--24538},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22a/xu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22a.html},\n abstract = \t {We study a nonparametric approach to Bayesian computation via feature means, where the expectation of prior features is updated to yield expected posterior features, based on regression from kernel or neural net features of the observations. All quantities involved in the Bayesian update are learned from observed data, making the method entirely model-free. The resulting algorithm is a novel instance of a kernel Bayes\u2019 rule (KBR). Our approach is based on importance weighting, which results in superior numerical stability to the existing approach to KBR, which requires operator inversion. We show the convergence of the estimator using a novel consistency analysis on the importance weighting estimator in the infinity norm. We evaluate our KBR on challenging synthetic benchmarks, including a filtering problem with a state-space model involving high dimensional image observations. The proposed method yields uniformly better empirical performance than the existing KBR, and competitive performance with other competing methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22a/xu22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/xu22a-supp.zip", "pdf_size": 504697, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7642501651079263729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Gatsby Unit; DeepMind; DeepMind; Gatsby Unit", "aff_domain": "ucl.ac.uk; ; ; ", "email": "ucl.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/xu22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Gatsby Unit;DeepMind", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatsbyunit.com;https://deepmind.com", "aff_unique_abbr": ";DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Improve Single-Point Zeroth-Order Optimization Using High-Pass and Low-Pass Filters", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16791", "id": "16791", "proceeding": "https://proceedings.mlr.press/v162/chen22w.html", "poster": "", "slides": "", "author_site": "Xin Chen, Yujie Tang, Na Li", "author": "Xin Chen; Yujie Tang; Na Li", "abstract": "Single-point zeroth-order optimization (SZO) is useful in solving online black-box optimization and control problems in time-varying environments, as it queries the function value only once at each time step. However, the vanilla SZO method is known to suffer from a large estimation variance and slow convergence, which seriously limits its practical application. 
In this work, we borrow the idea of high-pass and low-pass filters from extremum seeking control (continuous-time version of SZO) and develop a novel SZO method called HLF-SZO by integrating these filters. It turns out that the high-pass filter coincides with the residual feedback method, and the low-pass filter can be interpreted as the momentum method. As a result, the proposed HLF-SZO achieves a much smaller variance and much faster convergence than the vanilla SZO method, and empirically outperforms the residual-feedback SZO method, which are verified via extensive numerical experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22w/chen22w.pdf", "supp": "", "pdf_size": 1211852, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15259613935055448613&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "John A. Paulson School of Engineering and Applied Sciences, Harvard University, MA, US; John A. Paulson School of Engineering and Applied Sciences, Harvard University, MA, US; John A. Paulson School of Engineering and Applied Sciences, Harvard University, MA, US", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22w.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "John A. Paulson School of Engineering and Applied Sciences", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improved Certified Defenses against Data Poisoning with (Deterministic) Finite Aggregation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16363", "id": "16363", "proceeding": "https://proceedings.mlr.press/v162/wang22m.html", "poster": "/media/PosterPDFs/ICML%202022/140f6969d5213fd0ece03148e62e461e.png?t=1657864777.2619133", "slides": "", "author_site": "Wenxiao Wang, Alexander Levine, Soheil Feizi", "author": "Wenxiao Wang; Alexander J Levine; Soheil Feizi", "abstract": "Data poisoning attacks aim at manipulating model behaviors through distorting training data. Previously, an aggregation-based certified defense, Deep Partition Aggregation (DPA), was proposed to mitigate this threat. DPA predicts through an aggregation of base classifiers trained on disjoint subsets of data, thus restricting its sensitivity to dataset distortions. In this work, we propose an improved certified defense against general poisoning attacks, namely Finite Aggregation. In contrast to DPA, which directly splits the training set into disjoint subsets, our method first splits the training set into smaller disjoint subsets and then combines duplicates of them to build larger (but not disjoint) subsets for training base classifiers. This reduces the worst-case impacts of poison samples and thus improves certified robustness bounds. In addition, we offer an alternative view of our method, bridging the designs of deterministic and stochastic aggregation-based certified defenses. 
Empirically, our proposed Finite Aggregation consistently improves certificates on MNIST, CIFAR-10, and GTSRB, boosting certified fractions by up to 3.05%, 3.87% and 4.77%, respectively, while keeping the same clean accuracies as DPA\u2019s, effectively establishing a new state of the art in (pointwise) certified robustness against data poisoning.", "bibtex": "@InProceedings{pmlr-v162-wang22m,\n title = \t {Improved Certified Defenses against Data Poisoning with ({D}eterministic) Finite Aggregation},\n author = {Wang, Wenxiao and Levine, Alexander J and Feizi, Soheil},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22769--22783},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22m/wang22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22m.html},\n abstract = \t {Data poisoning attacks aim at manipulating model behaviors through distorting training data. Previously, an aggregation-based certified defense, Deep Partition Aggregation (DPA), was proposed to mitigate this threat. DPA predicts through an aggregation of base classifiers trained on disjoint subsets of data, thus restricting its sensitivity to dataset distortions. In this work, we propose an improved certified defense against general poisoning attacks, namely Finite Aggregation. In contrast to DPA, which directly splits the training set into disjoint subsets, our method first splits the training set into smaller disjoint subsets and then combines duplicates of them to build larger (but not disjoint) subsets for training base classifiers. This reduces the worst-case impacts of poison samples and thus improves certified robustness bounds. In addition, we offer an alternative view of our method, bridging the designs of deterministic and stochastic aggregation-based certified defenses. 
Empirically, our proposed Finite Aggregation consistently improves certificates on MNIST, CIFAR-10, and GTSRB, boosting certified fractions by up to 3.05%, 3.87% and 4.77%, respectively, while keeping the same clean accuracies as DPA\u2019s, effectively establishing a new state of the art in (pointwise) certified robustness against data poisoning.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22m/wang22m.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22m-supp.zip", "pdf_size": 2010918, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13385935402210758494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Maryland, College Park, Maryland, USA; Department of Computer Science, University of Maryland, College Park, Maryland, USA; Department of Computer Science, University of Maryland, College Park, Maryland, USA", "aff_domain": "umd.edu; ; ", "email": "umd.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22m.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Maryland, College Park", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improved Convergence Rates for Sparse Approximation Methods in Kernel-Based Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18031", "id": "18031", "proceeding": "https://proceedings.mlr.press/v162/vakili22a.html", "poster": "/media/PosterPDFs/ICML%202022/1068c6e4c8051cfd4e9ea8072e3189e2.png?t=1657719844.6347961", "slides": "/media/icml-2022/Slides/18031.pdf", "author_site": "Sattar Vakili, Jonathan Scarlett, Da-shan Shiu, Alberto Bernacchia", "author": "Sattar Vakili; Jonathan Scarlett; Da-Shan Shiu; Alberto Bernacchia", "abstract": "Kernel-based models such as kernel ridge regression and Gaussian processes are ubiquitous in machine learning applications for regression and optimization. It is well known that a major downside for kernel-based models is the high computational cost; given a dataset of $n$ samples, the cost grows as $\\mathcal{O}(n^3)$. Existing sparse approximation methods can yield a significant reduction in the computational cost, effectively reducing the actual cost down to as low as $\\mathcal{O}(n)$ in certain cases. Despite this remarkable empirical success, significant gaps remain in the existing results for the analytical bounds on the error due to approximation. In this work, we provide novel confidence intervals for the Nystr\u00f6m method and the sparse variational Gaussian process approximation method, which we establish using novel interpretations of the approximate (surrogate) posterior variance of the models. 
Our confidence intervals lead to improved performance bounds in both regression and optimization problems.", "bibtex": "@InProceedings{pmlr-v162-vakili22a,\n title = \t {Improved Convergence Rates for Sparse Approximation Methods in Kernel-Based Learning},\n author = {Vakili, Sattar and Scarlett, Jonathan and Shiu, Da-Shan and Bernacchia, Alberto},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21960--21983},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vakili22a/vakili22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vakili22a.html},\n abstract = \t {Kernel-based models such as kernel ridge regression and Gaussian processes are ubiquitous in machine learning applications for regression and optimization. It is well known that a major downside for kernel-based models is the high computational cost; given a dataset of $n$ samples, the cost grows as $\\mathcal{O}(n^3)$. Existing sparse approximation methods can yield a significant reduction in the computational cost, effectively reducing the actual cost down to as low as $\\mathcal{O}(n)$ in certain cases. Despite this remarkable empirical success, significant gaps remain in the existing results for the analytical bounds on the error due to approximation. In this work, we provide novel confidence intervals for the Nystr\u00f6m method and the sparse variational Gaussian process approximation method, which we establish using novel interpretations of the approximate (surrogate) posterior variance of the models. Our confidence intervals lead to improved performance bounds in both regression and optimization problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/vakili22a/vakili22a.pdf", "supp": "", "pdf_size": 399154, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18071426535727517621&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "MediaTek Research; National University of Singapore; MediaTek Research; MediaTek Research", "aff_domain": "mtkresearch.com; ; ; ", "email": "mtkresearch.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/vakili22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "MediaTek Inc.;National University of Singapore", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.mediatek.com/;https://www.nus.edu.sg", "aff_unique_abbr": "MediaTek;NUS", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Improved No-Regret Algorithms for Stochastic Shortest Path with Linear MDP", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16969", "id": "16969", "proceeding": "https://proceedings.mlr.press/v162/chen22h.html", "poster": "/media/PosterPDFs/ICML%202022/6aed000af86a084f9cb0264161e29dd3.png?t=1657184343.0768006", "slides": "", "author_site": "Liyu Chen, Rahul Jain, Haipeng Luo", "author": "Liyu Chen; Rahul Jain; Haipeng Luo", "abstract": "We introduce two new no-regret algorithms for the stochastic shortest path (SSP) problem with a linear MDP that significantly improve over the only existing results of (Vial et al., 2021). 
Our first algorithm is computationally efficient and achieves a regret bound $O(\\sqrt{d^3B_{\\star}^2T_{\\star} K})$, where $d$ is the dimension of the feature space, $B_{\\star}$ and $T_{\\star}$ are upper bounds of the expected costs and hitting time of the optimal policy respectively, and $K$ is the number of episodes. The same algorithm with a slight modification also achieves logarithmic regret of order $O(\\frac{d^3B_{\\star}^4}{c_{\\min}^2\\text{\\rm gap}_{\\min} }\\ln^5\\frac{dB_{\\star} K}{c_{\\min}})$, where $\\text{\\rm gap}_{\\min}$ is the minimum sub-optimality gap and $c_{\\min}$ is the minimum cost over all state-action pairs. Our result is obtained by developing a simpler and improved analysis for the finite-horizon approximation of (Cohen et al., 2021) with a smaller approximation error, which might be of independent interest. On the other hand, using variance-aware confidence sets in a global optimization problem, our second algorithm is computationally inefficient but achieves the first \u201chorizon-free\u201d regret bound $O(d^{3.5}B_{\\star}\\sqrt{K})$ with no polynomial dependency on $T_{\\star}$ or $1/c_{\\min}$, almost matching the $\\Omega(dB_{\\star}\\sqrt{K})$ lower bound from (Min et al., 2021).", "bibtex": "@InProceedings{pmlr-v162-chen22h,\n title = \t {Improved No-Regret Algorithms for Stochastic Shortest Path with Linear {MDP}},\n author = {Chen, Liyu and Jain, Rahul and Luo, Haipeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3204--3245},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22h/chen22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22h.html},\n abstract = \t {We introduce two new no-regret algorithms for the stochastic shortest path (SSP) problem with a linear MDP that significantly improve over the only existing results of (Vial et al., 2021). Our first algorithm is computationally efficient and achieves a regret bound $O(\\sqrt{d^3B_{\\star}^2T_{\\star} K})$, where $d$ is the dimension of the feature space, $B_{\\star}$ and $T_{\\star}$ are upper bounds of the expected costs and hitting time of the optimal policy respectively, and $K$ is the number of episodes. The same algorithm with a slight modification also achieves logarithmic regret of order $O(\\frac{d^3B_{\\star}^4}{c_{\\min}^2\\text{\\rm gap}_{\\min} }\\ln^5\\frac{dB_{\\star} K}{c_{\\min}})$, where $\\text{\\rm gap}_{\\min}$ is the minimum sub-optimality gap and $c_{\\min}$ is the minimum cost over all state-action pairs. Our result is obtained by developing a simpler and improved analysis for the finite-horizon approximation of (Cohen et al., 2021) with a smaller approximation error, which might be of independent interest. 
On the other hand, using variance-aware confidence sets in a global optimization problem, our second algorithm is computationally inefficient but achieves the first \u201chorizon-free\u201d regret bound $O(d^{3.5}B_{\\star}\\sqrt{K})$ with no polynomial dependency on $T_{\\star}$ or $1/c_{\\min}$, almost matching the $\\Omega(dB_{\\star}\\sqrt{K})$ lower bound from (Min et al., 2021).}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22h/chen22h.pdf", "supp": "", "pdf_size": 561846, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1605171541469559093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Southern California; University of Southern California; University of Southern California", "aff_domain": "usc.edu; ; ", "email": "usc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22h.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improved Rates for Differentially Private Stochastic Convex Optimization with Heavy-Tailed Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16327", "id": "16327", "proceeding": "https://proceedings.mlr.press/v162/kamath22a.html", "poster": "/media/PosterPDFs/ICML%202022/b607aa5b2fd58dd860bfb55619389982.png?t=1658153693.5314956", "slides": "/media/icml-2022/Slides/16327.pdf", "author_site": "Gautam Kamath, Xingtu Liu, Huanyu Zhang", "author": "Gautam Kamath; Xingtu Liu; Huanyu Zhang", "abstract": "We study stochastic convex optimization with heavy-tailed data under the constraint of differential privacy (DP). Most prior work on this problem is restricted to the case where the loss function is Lipschitz. Instead, as introduced by Wang, Xiao, Devadas, and Xu\u00a0\\cite{WangXDX20}, we study general convex loss functions with the assumption that the distribution of gradients has bounded $k$-th moments. We provide improved upper bounds on the excess population risk under concentrated DP for convex and strongly convex loss functions. Along the way, we derive new algorithms for private mean estimation of heavy-tailed distributions, under both pure and concentrated DP. Finally, we prove nearly-matching lower bounds for private stochastic convex optimization with strongly convex losses and mean estimation, showing new separations between pure and concentrated DP.", "bibtex": "@InProceedings{pmlr-v162-kamath22a,\n title = \t {Improved Rates for Differentially Private Stochastic Convex Optimization with Heavy-Tailed Data},\n author = {Kamath, Gautam and Liu, Xingtu and Zhang, Huanyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10633--10660},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kamath22a/kamath22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kamath22a.html},\n abstract = \t {We study stochastic convex optimization with heavy-tailed data under the constraint of differential privacy (DP). 
Most prior work on this problem is restricted to the case where the loss function is Lipschitz. Instead, as introduced by Wang, Xiao, Devadas, and Xu\u00a0\\cite{WangXDX20}, we study general convex loss functions with the assumption that the distribution of gradients has bounded $k$-th moments. We provide improved upper bounds on the excess population risk under concentrated DP for convex and strongly convex loss functions. Along the way, we derive new algorithms for private mean estimation of heavy-tailed distributions, under both pure and concentrated DP. Finally, we prove nearly-matching lower bounds for private stochastic convex optimization with strongly convex losses and mean estimation, showing new separations between pure and concentrated DP.}\n}", "pdf": "https://proceedings.mlr.press/v162/kamath22a/kamath22a.pdf", "supp": "", "pdf_size": 435488, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14340776038385040745&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Cheriton School of Computer Science, University of Waterloo; Cheriton School of Computer Science, University of Waterloo; Meta", "aff_domain": "uwaterloo.ca;uwaterloo.ca;fb.com", "email": "uwaterloo.ca;uwaterloo.ca;fb.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kamath22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Waterloo;Meta", "aff_unique_dep": "Cheriton School of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://uwaterloo.ca;https://meta.com", "aff_unique_abbr": "UWaterloo;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Waterloo;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Canada;United States" }, { "title": "Improved Regret for Differentially Private Exploration in Linear MDP", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16759", "id": "16759", "proceeding": "https://proceedings.mlr.press/v162/ngo22a.html", "poster": "/media/PosterPDFs/ICML%202022/007ff380ee5ac49ffc34442f5c2a2b86.png?t=1657646879.4748278", "slides": "", "author_site": "Dung Ngo, Giuseppe Vietri, Steven Wu", "author": "Dung Daniel T Ngo; Giuseppe Vietri; Steven Wu", "abstract": "We study privacy-preserving exploration in sequential decision-making for environments that rely on sensitive data such as medical records. In particular, we focus on solving the problem of reinforcement learning (RL) subject to the constraint of (joint) differential privacy in the linear MDP setting, where both dynamics and rewards are given by linear functions. Prior work on this problem due to (Luyo et al., 2021) achieves a regret rate that has a dependence of O(K^{3/5}) on the number of episodes K. We provide a private algorithm with an improved regret rate with an optimal dependence of O($\\sqrt{K}$) on the number of episodes. The key recipe for our stronger regret guarantee is the adaptivity in the policy update schedule, in which an update only occurs when sufficient changes in the data are detected. As a result, our algorithm benefits from low switching cost and only performs O(log(K)) updates, which greatly reduces the amount of privacy noise. 
Finally, in the most prevalent privacy regimes where the privacy parameter \\epsilon is a constant, our algorithm incurs negligible privacy cost{\u2014}in comparison with the existing non-private regret bounds, the additional regret due to privacy appears in lower-order terms.", "bibtex": "@InProceedings{pmlr-v162-ngo22a,\n title = \t {Improved Regret for Differentially Private Exploration in Linear {MDP}},\n author = {Ngo, Dung Daniel T and Vietri, Giuseppe and Wu, Steven},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16529--16552},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ngo22a/ngo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ngo22a.html},\n abstract = \t {We study privacy-preserving exploration in sequential decision-making for environments that rely on sensitive data such as medical records. In particular, we focus on solving the problem of reinforcement learning (RL) subject to the constraint of (joint) differential privacy in the linear MDP setting, where both dynamics and rewards are given by linear functions. Prior work on this problem due to (Luyo et al., 2021) achieves a regret rate that has a dependence of O(K^{3/5}) on the number of episodes K. We provide a private algorithm with an improved regret rate with an optimal dependence of O($\\sqrt{K}$) on the number of episodes. The key recipe for our stronger regret guarantee is the adaptivity in the policy update schedule, in which an update only occurs when sufficient changes in the data are detected. As a result, our algorithm benefits from low switching cost and only performs O(log(K)) updates, which greatly reduces the amount of privacy noise. 
Finally, in the most prevalent privacy regimes where the privacy parameter \\epsilon is a constant, our algorithm incurs negligible privacy cost{\u2014}in comparison with the existing non-private regret bounds, the additional regret due to privacy appears in lower-order terms.}\n}", "pdf": "https://proceedings.mlr.press/v162/ngo22a/ngo22a.pdf", "supp": "", "pdf_size": 435511, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4569680233457922610&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science & Engineering, University of Minnesota, Minneapolis, MN 55455, USA; Department of Computer Science & Engineering, University of Minnesota, Minneapolis, MN 55455, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, PA 15213, USA", "aff_domain": "umn.edu;umn.edu;cmu.edu", "email": "umn.edu;umn.edu;cmu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ngo22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Minnesota;Carnegie Mellon University", "aff_unique_dep": "Department of Computer Science & Engineering;School of Computer Science", "aff_unique_url": "https://www.umn.edu;https://www.cmu.edu", "aff_unique_abbr": "UMN;CMU", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Minneapolis;Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improved StyleGAN-v2 based Inversion for Out-of-Distribution Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17161", "id": "17161", "proceeding": "https://proceedings.mlr.press/v162/subramanyam22a.html", "poster": "/media/PosterPDFs/ICML%202022/5e2b66750529d8ae895ad2591118466f.png?t=1657954833.9180124", "slides": "", "author_site": "Rakshith Subramanyam, Vivek Narayanaswamy, Mark Naufel, Andreas Spanias, Jayaraman J. Thiagarajan", "author": "Rakshith Subramanyam; Vivek Narayanaswamy; Mark Naufel; Andreas Spanias; Jayaraman J. Thiagarajan", "abstract": "Inverting an image onto the latent space of pre-trained generators, e.g., StyleGAN-v2, has emerged as a popular strategy to leverage strong image priors for ill-posed restoration. Several studies have showed that this approach is effective at inverting images similar to the data used for training. However, with out-of-distribution (OOD) data that the generator has not been exposed to, existing inversion techniques produce sub-optimal results. In this paper, we propose SPHInX (StyleGAN with Projection Heads for Inverting X), an approach for accurately embedding OOD images onto the StyleGAN latent space. SPHInX optimizes a style projection head using a novel training strategy that imposes a vicinal regularization in the StyleGAN latent space. To further enhance OOD inversion, SPHInX can additionally optimize a content projection head and noise variables in every layer. 
Our empirical studies on a suite of OOD data show that, in addition to producing higher quality reconstructions over the state-of-the-art inversion techniques, SPHInX is effective for ill-posed restoration tasks while offering semantic editing capabilities.", "bibtex": "@InProceedings{pmlr-v162-subramanyam22a,\n title = \t {Improved {S}tyle{GAN}-v2 based Inversion for Out-of-Distribution Images},\n author = {Subramanyam, Rakshith and Narayanaswamy, Vivek and Naufel, Mark and Spanias, Andreas and Thiagarajan, Jayaraman J.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20625--20639},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/subramanyam22a/subramanyam22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/subramanyam22a.html},\n abstract = \t {Inverting an image onto the latent space of pre-trained generators, e.g., StyleGAN-v2, has emerged as a popular strategy to leverage strong image priors for ill-posed restoration. Several studies have showed that this approach is effective at inverting images similar to the data used for training. However, with out-of-distribution (OOD) data that the generator has not been exposed to, existing inversion techniques produce sub-optimal results. In this paper, we propose SPHInX (StyleGAN with Projection Heads for Inverting X), an approach for accurately embedding OOD images onto the StyleGAN latent space. SPHInX optimizes a style projection head using a novel training strategy that imposes a vicinal regularization in the StyleGAN latent space. To further enhance OOD inversion, SPHInX can additionally optimize a content projection head and noise variables in every layer. 
Our empirical studies on a suite of OOD data show that, in addition to producing higher quality reconstructions over the state-of-the-art inversion techniques, SPHInX is effective for ill-posed restoration tasks while offering semantic editing capabilities.}\n}", "pdf": "https://proceedings.mlr.press/v162/subramanyam22a/subramanyam22a.pdf", "supp": "", "pdf_size": 26969112, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17630542421274159814&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Arizona State University; Arizona State University; Arizona State University; Arizona State University; Lawrence Livermore National Laboratories", "aff_domain": "asu.edu; ; ; ; ", "email": "asu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/subramanyam22a.html", "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Arizona State University;Lawrence Livermore National Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.llnl.gov", "aff_unique_abbr": "ASU;LLNL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Adversarial Robustness via Mutual Information Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17669", "id": "17669", "proceeding": "https://proceedings.mlr.press/v162/zhou22j.html", "poster": "/media/PosterPDFs/ICML%202022/d902c3ce47124c66ce615d5ad9ba304f_O4kgXQA.png?t=1657167635.9478061", "slides": "", "author_site": "Dawei Zhou, Nannan Wang, Xinbo Gao, Bo Han, Xiaoyu Wang, Yibing Zhan, Tongliang Liu", "author": "Dawei Zhou; Nannan Wang; Xinbo Gao; Bo Han; Xiaoyu Wang; Yibing Zhan; Tongliang Liu", "abstract": "Deep neural networks (DNNs) are found to be vulnerable to adversarial noise. They are typically misled by adversarial samples to make wrong predictions. To alleviate this negative effect, in this paper, we investigate the dependence between outputs of the target model and input adversarial samples from the perspective of information theory, and propose an adversarial defense method. Specifically, we first measure the dependence by estimating the mutual information (MI) between outputs and the natural patterns of inputs (called natural MI) and MI between outputs and the adversarial patterns of inputs (called adversarial MI), respectively. We find that adversarial samples usually have larger adversarial MI and smaller natural MI compared with those w.r.t. natural samples. Motivated by this observation, we propose to enhance the adversarial robustness by maximizing the natural MI and minimizing the adversarial MI during the training process. In this way, the target model is expected to pay more attention to the natural pattern that contains objective semantics. 
Empirical evaluations demonstrate that our method could effectively improve the adversarial accuracy against multiple attacks.", "bibtex": "@InProceedings{pmlr-v162-zhou22j,\n title = \t {Improving Adversarial Robustness via Mutual Information Estimation},\n author = {Zhou, Dawei and Wang, Nannan and Gao, Xinbo and Han, Bo and Wang, Xiaoyu and Zhan, Yibing and Liu, Tongliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27338--27352},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22j/zhou22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22j.html},\n abstract = \t {Deep neural networks (DNNs) are found to be vulnerable to adversarial noise. They are typically misled by adversarial samples to make wrong predictions. To alleviate this negative effect, in this paper, we investigate the dependence between outputs of the target model and input adversarial samples from the perspective of information theory, and propose an adversarial defense method. Specifically, we first measure the dependence by estimating the mutual information (MI) between outputs and the natural patterns of inputs (called natural MI) and MI between outputs and the adversarial patterns of inputs (called adversarial MI), respectively. We find that adversarial samples usually have larger adversarial MI and smaller natural MI compared with those w.r.t. natural samples. Motivated by this observation, we propose to enhance the adversarial robustness by maximizing the natural MI and minimizing the adversarial MI during the training process. In this way, the target model is expected to pay more attention to the natural pattern that contains objective semantics. 
Empirical evaluations demonstrate that our method could effectively improve the adversarial accuracy against multiple attacks.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22j/zhou22j.pdf", "supp": "", "pdf_size": 981500, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18303472399739418322&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "State Key Laboratory of Integrated Services Networks, School of Telecommunications Engineering, Xidian University; State Key Laboratory of Integrated Services Networks, School of Telecommunications Engineering, Xidian University; Chongqing Key Laboratory of Image Cognition, Chongqing University of Posts and Telecommunications; Department of Computer Science, Hong Kong Baptist University; The Chinese University of Hong Kong (Shenzhen); JD Explore Academy; TML Lab, Sydney AI Centre, The University of Sydney", "aff_domain": "xidian.edu.cn; ; ; ; ; ; ", "email": "xidian.edu.cn; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhou22j.html", "aff_unique_index": "0;0;1;2;3;4;5", "aff_unique_norm": "Xidian University;Chongqing University of Posts and Telecommunications;Hong Kong Baptist University;Chinese University of Hong Kong;JD;University of Sydney", "aff_unique_dep": "School of Telecommunications Engineering;Chongqing Key Laboratory of Image Cognition;Department of Computer Science;;JD Explore Academy;TML Lab", "aff_unique_url": "http://www.xidian.edu.cn/;;https://www.hkbu.edu.hk;https://www.cuhk.edu.cn;;https://www.sydney.edu.au", "aff_unique_abbr": "Xidian;;HKBU;CUHK;;USYD", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Hong Kong SAR;Shenzhen;Sydney", "aff_country_unique_index": "0;0;0;0;0;2", "aff_country_unique": "China;;Australia" }, { "title": "Improving Ensemble Distillation With Weight Averaging and Diversifying Perturbation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17837", "id": "17837", "proceeding": "https://proceedings.mlr.press/v162/nam22a.html", "poster": "/media/PosterPDFs/ICML%202022/a1d4c20b182ad7137ab3606f0e3fc8a4.png?t=1657451381.877927", "slides": "", "author_site": "Giung Nam, Hyungi Lee, Byeongho Heo, Juho Lee", "author": "Giung Nam; Hyungi Lee; Byeongho Heo; Juho Lee", "abstract": "Ensembles of deep neural networks have demonstrated superior performance, but their heavy computational cost hinders applying them for resource-limited environments. It motivates distilling knowledge from the ensemble teacher into a smaller student network, and there are two important design choices for this ensemble distillation: 1) how to construct the student network, and 2) what data should be shown during training. In this paper, we propose a weight averaging technique where a student with multiple subnetworks is trained to absorb the functional diversity of ensemble teachers, but then those subnetworks are properly averaged for inference, giving a single student network with no additional inference cost. We also propose a perturbation strategy that seeks inputs from which the diversities of teachers can be better transferred to the student. 
Combining these two, our method significantly improves upon previous methods on various image classification tasks.", "bibtex": "@InProceedings{pmlr-v162-nam22a,\n title = \t {Improving Ensemble Distillation With Weight Averaging and Diversifying Perturbation},\n author = {Nam, Giung and Lee, Hyungi and Heo, Byeongho and Lee, Juho},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16353--16367},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nam22a/nam22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nam22a.html},\n abstract = \t {Ensembles of deep neural networks have demonstrated superior performance, but their heavy computational cost hinders applying them for resource-limited environments. It motivates distilling knowledge from the ensemble teacher into a smaller student network, and there are two important design choices for this ensemble distillation: 1) how to construct the student network, and 2) what data should be shown during training. In this paper, we propose a weight averaging technique where a student with multiple subnetworks is trained to absorb the functional diversity of ensemble teachers, but then those subnetworks are properly averaged for inference, giving a single student network with no additional inference cost. We also propose a perturbation strategy that seeks inputs from which the diversities of teachers can be better transferred to the student. Combining these two, our method significantly improves upon previous methods on various image classification tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/nam22a/nam22a.pdf", "supp": "", "pdf_size": 7188217, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15634605277253421377&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Korea Advanced Institute of Science and Technology, Daejeon, Korea; Korea Advanced Institute of Science and Technology, Daejeon, Korea; Naver, Korea; Korea Advanced Institute of Science and Technology, Daejeon, Korea + AITRICS, Seoul, South Korea", "aff_domain": "kaist.ac.kr; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nam22a.html", "aff_unique_index": "0;0;1;0+2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;NAVER Corporation;AITRICS", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.naver.com;", "aff_unique_abbr": "KAIST;Naver;", "aff_campus_unique_index": "0;0;0+2", "aff_campus_unique": "Daejeon;;Seoul", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "Improving Language Models by Retrieving from Trillions of Tokens", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17479", "id": "17479", "proceeding": "https://proceedings.mlr.press/v162/borgeaud22a.html", "poster": "/media/PosterPDFs/ICML%202022/d89a66c7c80a29b1bdbab0f2a1a94af8.png?t=1657905493.6845987", "slides": "", "author_site": "Sebastian Borgeaud, Arthur Mensch, Jordan Hoffmann, Trevor Cai, Eliza Rutherford, Katie Millican, George van den Driessche, Jean-Baptiste Lespiau, Bogdan Damoc, Aidan Clark, Diego de Las Casas, Aurelia 
Guy, Jacob Menick, Roman Ring, Tom Hennigan, Saffron Huang, Loren Maggiore, Chris Jones, Albin Cassirer, Andy Brock, Michela Paganini, Geoffrey Irving, Oriol Vinyals, Simon Osindero, Karen Simonyan, Jack Rae, Erich Elsen, Laurent Sifre", "author": "Sebastian Borgeaud; Arthur Mensch; Jordan Hoffmann; Trevor Cai; Eliza Rutherford; Katie Millican; George Bm Van Den Driessche; Jean-Baptiste Lespiau; Bogdan Damoc; Aidan Clark; Diego De Las Casas; Aurelia Guy; Jacob Menick; Roman Ring; Tom Hennigan; Saffron Huang; Loren Maggiore; Chris Jones; Albin Cassirer; Andy Brock; Michela Paganini; Geoffrey Irving; Oriol Vinyals; Simon Osindero; Karen Simonyan; Jack Rae; Erich Elsen; Laurent Sifre", "abstract": "We enhance auto-regressive language models by conditioning on document chunks retrieved from a large corpus, based on local similarity with preceding tokens. With a 2 trillion token database, our Retrieval-Enhanced Transformer (RETRO) obtains comparable performance to GPT-3 and Jurassic-1 on the Pile, despite using 25{\\texttimes} fewer parameters. After fine-tuning, RETRO performance translates to downstream knowledge-intensive tasks such as question answering. RETRO combines a frozen Bert retriever, a differentiable encoder and a chunked cross-attention mechanism to predict tokens based on an order of magnitude more data than what is typically consumed during training. We typically train RETRO from scratch, yet can also rapidly RETROfit pre-trained transformers with retrieval and still achieve good performance. Our work opens up new avenues for improving language models through explicit memory at unprecedented scale.", "bibtex": "@InProceedings{pmlr-v162-borgeaud22a,\n title = \t {Improving Language Models by Retrieving from Trillions of Tokens},\n author = {Borgeaud, Sebastian and Mensch, Arthur and Hoffmann, Jordan and Cai, Trevor and Rutherford, Eliza and Millican, Katie and Van Den Driessche, George Bm and Lespiau, Jean-Baptiste and Damoc, Bogdan and Clark, Aidan and De Las Casas, Diego and Guy, Aurelia and Menick, Jacob and Ring, Roman and Hennigan, Tom and Huang, Saffron and Maggiore, Loren and Jones, Chris and Cassirer, Albin and Brock, Andy and Paganini, Michela and Irving, Geoffrey and Vinyals, Oriol and Osindero, Simon and Simonyan, Karen and Rae, Jack and Elsen, Erich and Sifre, Laurent},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2206--2240},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/borgeaud22a/borgeaud22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/borgeaud22a.html},\n abstract = \t {We enhance auto-regressive language models by conditioning on document chunks retrieved from a large corpus, based on local similarity with preceding tokens. With a 2 trillion token database, our Retrieval-Enhanced Transformer (RETRO) obtains comparable performance to GPT-3 and Jurassic-1 on the Pile, despite using 25{\\texttimes} fewer parameters. After fine-tuning, RETRO performance translates to downstream knowledge-intensive tasks such as question answering. 
RETRO combines a frozen Bert retriever, a differentiable encoder and a chunked cross-attention mechanism to predict tokens based on an order of magnitude more data than what is typically consumed during training. We typically train RETRO from scratch, yet can also rapidly RETROfit pre-trained transformers with retrieval and still achieve good performance. Our work opens up new avenues for improving language models through explicit memory at unprecedented scale.}\n}", "pdf": "https://proceedings.mlr.press/v162/borgeaud22a/borgeaud22a.pdf", "supp": "", "pdf_size": 986918, "gs_citation": 1323, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14088618064924644056&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;;;;;;;;;;;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;;;;;;;;;", "github": "", "project": "", "author_num": 28, "oa": "https://proceedings.mlr.press/v162/borgeaud22a.html" }, { "title": "Improving Mini-batch Optimal Transport via Partial Transportation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17551", "id": "17551", "proceeding": "https://proceedings.mlr.press/v162/nguyen22e.html", "poster": "/media/PosterPDFs/ICML%202022/3f53d7190148675e3cd472fc826828c5.png?t=1657398693.1983068", "slides": "", "author_site": "Khai Nguyen, Dang Nguyen, The-Anh Vu-Le, Tung Pham, Nhat Ho", "author": "Khai Nguyen; Dang Nguyen; The-Anh Vu-Le; Tung Pham; Nhat Ho", "abstract": "Mini-batch optimal transport (m-OT) has been widely used recently to deal with the memory issue of OT in large-scale applications. Despite their practicality, m-OT suffers from misspecified mappings, namely, mappings that are optimal on the mini-batch level but are partially wrong in the comparison with the optimal transportation plan between the original measures. Motivated by the misspecified mappings issue, we propose a novel mini-batch method by using partial optimal transport (POT) between mini-batch empirical measures, which we refer to as mini-batch partial optimal transport (m-POT). Leveraging the insight from the partial transportation, we explain the source of misspecified mappings from the m-OT and motivate why limiting the amount of transported masses among mini-batches via POT can alleviate the incorrect mappings. Finally, we carry out extensive experiments on various applications such as deep domain adaptation, partial domain adaptation, deep generative model, color transfer, and gradient flow to demonstrate the favorable performance of m-POT compared to current mini-batch methods.", "bibtex": "@InProceedings{pmlr-v162-nguyen22e,\n title = \t {Improving Mini-batch Optimal Transport via Partial Transportation},\n author = {Nguyen, Khai and Nguyen, Dang and Vu-Le, The-Anh and Pham, Tung and Ho, Nhat},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16656--16690},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nguyen22e/nguyen22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/nguyen22e.html},\n abstract = \t {Mini-batch optimal transport (m-OT) has been widely used recently to deal with the memory issue of OT in large-scale applications. 
Despite their practicality, m-OT suffers from misspecified mappings, namely, mappings that are optimal on the mini-batch level but are partially wrong in the comparison with the optimal transportation plan between the original measures. Motivated by the misspecified mappings issue, we propose a novel mini-batch method by using partial optimal transport (POT) between mini-batch empirical measures, which we refer to as mini-batch partial optimal transport (m-POT). Leveraging the insight from the partial transportation, we explain the source of misspecified mappings from the m-OT and motivate why limiting the amount of transported masses among mini-batches via POT can alleviate the incorrect mappings. Finally, we carry out extensive experiments on various applications such as deep domain adaptation, partial domain adaptation, deep generative model, color transfer, and gradient flow to demonstrate the favorable performance of m-POT compared to current mini-batch methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/nguyen22e/nguyen22e.pdf", "supp": "", "pdf_size": 5474719, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13182138160286154979&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Statistics and Data Sciences, The University of Texas at Austin; VinAI Research; VinAI Research; VinAI Research; Department of Statistics and Data Sciences, The University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ; ", "email": "utexas.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/nguyen22e.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "University of Texas at Austin;VinAI Research", "aff_unique_dep": "Department of Statistics and Data Sciences;", "aff_unique_url": "https://www.utexas.edu;https://www.vinai.io/", "aff_unique_abbr": "UT Austin;VinAI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "United States;Vietnam" }, { "title": "Improving Out-of-Distribution Robustness via Selective Augmentation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18011", "id": "18011", "proceeding": "https://proceedings.mlr.press/v162/yao22b.html", "poster": "", "slides": "", "author_site": "Huaxiu Yao, Yu Wang, Sai Li, Linjun Zhang, Weixin Liang, James Zou, Chelsea Finn", "author": "Huaxiu Yao; Yu Wang; Sai Li; Linjun Zhang; Weixin Liang; James Zou; Chelsea Finn", "abstract": "Machine learning algorithms typically assume that training and test examples are drawn from the same distribution. However, distribution shift is a common problem in real-world applications and can cause models to perform dramatically worse at test time. In this paper, we specifically consider the problems of subpopulation shifts (e.g., imbalanced data) and domain shifts. While prior works often seek to explicitly regularize internal representations or predictors of the model to be domain invariant, we instead aim to learn invariant predictors without restricting the model\u2019s internal representations or predictors. This leads to a simple mixup-based technique which learns invariant predictors via selective augmentation called LISA. LISA selectively interpolates samples either with the same labels but different domains or with the same domain but different labels. 
Empirically, we study the effectiveness of LISA on nine benchmarks ranging from subpopulation shifts to domain shifts, and we find that LISA consistently outperforms other state-of-the-art methods and leads to more invariant predictors. We further analyze a linear setting and theoretically show how LISA leads to a smaller worst-group error.", "bibtex": "@InProceedings{pmlr-v162-yao22b,\n title = \t {Improving Out-of-Distribution Robustness via Selective Augmentation},\n author = {Yao, Huaxiu and Wang, Yu and Li, Sai and Zhang, Linjun and Liang, Weixin and Zou, James and Finn, Chelsea},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25407--25437},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yao22b/yao22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/yao22b.html},\n abstract = \t {Machine learning algorithms typically assume that training and test examples are drawn from the same distribution. However, distribution shift is a common problem in real-world applications and can cause models to perform dramatically worse at test time. In this paper, we specifically consider the problems of subpopulation shifts (e.g., imbalanced data) and domain shifts. While prior works often seek to explicitly regularize internal representations or predictors of the model to be domain invariant, we instead aim to learn invariant predictors without restricting the model\u2019s internal representations or predictors. This leads to a simple mixup-based technique which learns invariant predictors via selective augmentation called LISA. LISA selectively interpolates samples either with the same labels but different domains or with the same domain but different labels. Empirically, we study the effectiveness of LISA on nine benchmarks ranging from subpopulation shifts to domain shifts, and we find that LISA consistently outperforms other state-of-the-art methods and leads to more invariant predictors. 
We further analyze a linear setting and theoretically show how LISA leads to a smaller worst-group error.}\n}", "pdf": "https://proceedings.mlr.press/v162/yao22b/yao22b.pdf", "supp": "", "pdf_size": 5265609, "gs_citation": 237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4894079975600009568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Stanford University; University of California San Diego; Renmin University of China; Rutgers University; Stanford University; Stanford University; Stanford University", "aff_domain": "cs.stanford.edu; ;ruc.edu.cn; ; ; ; ", "email": "cs.stanford.edu; ;ruc.edu.cn; ; ; ; ", "github": "https://github.com/huaxiuyao/LISA", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/yao22b.html", "aff_unique_index": "0;1;2;3;0;0;0", "aff_unique_norm": "Stanford University;University of California, San Diego;Renmin University of China;Rutgers University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://ucsd.edu;http://www.ruc.edu.cn;https://www.rutgers.edu", "aff_unique_abbr": "Stanford;UCSD;RUC;Rutgers", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Stanford;San Diego;", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Improving Policy Optimization with Generalist-Specialist Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18319", "id": "18319", "proceeding": "https://proceedings.mlr.press/v162/jia22a.html", "poster": "", "slides": "", "author_site": "Zhiwei Jia, Xuanlin Li, Zhan Ling, Shuang Liu, Yiran Wu, Hao Su", "author": "Zhiwei Jia; Xuanlin Li; Zhan Ling; Shuang Liu; Yiran Wu; Hao Su", "abstract": "Generalization in deep reinforcement learning over unseen environment variations usually requires policy learning over a large set of diverse training variations. We empirically observe that an agent trained on many variations (a generalist) tends to learn faster at the beginning, yet its performance plateaus at a less optimal level for a long time. In contrast, an agent trained only on a few variations (a specialist) can often achieve high returns under a limited computational budget. To have the best of both worlds, we propose a novel generalist-specialist training framework. Specifically, we first train a generalist on all environment variations; when it fails to improve, we launch a large population of specialists with weights cloned from the generalist, each trained to master a selected small subset of variations. We finally resume the training of the generalist with auxiliary rewards induced by demonstrations of all specialists. In particular, we investigate the timing to start specialist training and compare strategies to learn generalists with assistance from specialists. 
We show that this framework pushes the envelope of policy learning on several challenging and popular benchmarks including Procgen, Meta-World and ManiSkill.", "bibtex": "@InProceedings{pmlr-v162-jia22a,\n title = \t {Improving Policy Optimization with Generalist-Specialist Learning},\n author = {Jia, Zhiwei and Li, Xuanlin and Ling, Zhan and Liu, Shuang and Wu, Yiran and Su, Hao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10104--10119},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jia22a/jia22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jia22a.html},\n abstract = \t {Generalization in deep reinforcement learning over unseen environment variations usually requires policy learning over a large set of diverse training variations. We empirically observe that an agent trained on many variations (a generalist) tends to learn faster at the beginning, yet its performance plateaus at a less optimal level for a long time. In contrast, an agent trained only on a few variations (a specialist) can often achieve high returns under a limited computational budget. To have the best of both worlds, we propose a novel generalist-specialist training framework. Specifically, we first train a generalist on all environment variations; when it fails to improve, we launch a large population of specialists with weights cloned from the generalist, each trained to master a selected small subset of variations. We finally resume the training of the generalist with auxiliary rewards induced by demonstrations of all specialists. In particular, we investigate the timing to start specialist training and compare strategies to learn generalists with assistance from specialists. 
We show that this framework pushes the envelope of policy learning on several challenging and popular benchmarks including Procgen, Meta-World and ManiSkill.}\n}", "pdf": "https://proceedings.mlr.press/v162/jia22a/jia22a.pdf", "supp": "", "pdf_size": 3828028, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14525219330814535505&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff": "University of California, San Diego; University of California, San Diego; University of California, San Diego; University of California, San Diego; University of California, San Diego; University of California, San Diego", "aff_domain": "eng.ucsd.edu; ; ; ; ;eng.ucsd.edu", "email": "eng.ucsd.edu; ; ; ; ;eng.ucsd.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/jia22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Robustness against Real-World and Worst-Case Distribution Shifts through Decision Region Quantification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17071", "id": "17071", "proceeding": "https://proceedings.mlr.press/v162/schwinn22a.html", "poster": "/media/PosterPDFs/ICML%202022/7f53f8c6c730af6aeb52e66eb74d8507_fee84xR.png?t=1655815073.979463", "slides": "/media/icml-2022/Slides/17071_QbtTiLe.pdf", "author_site": "Leo Schwinn, Leon Bungert, An Nguyen, Ren\u00e9 Raab, Falk Pulsmeyer, Doina Precup, Bjoern Eskofier, Dario Zanca", "author": "Leo Schwinn; Leon Bungert; An Nguyen; Ren\u00e9 Raab; Falk Pulsmeyer; Doina Precup; Bjoern Eskofier; Dario Zanca", "abstract": "The reliability of neural networks is essential for their use in safety-critical applications. Existing approaches generally aim at improving the robustness of neural networks to either real-world distribution shifts (e.g., common corruptions and perturbations, spatial transformations, and natural adversarial examples) or worst-case distribution shifts (e.g., optimized adversarial examples). In this work, we propose the Decision Region Quantification (DRQ) algorithm to improve the robustness of any differentiable pre-trained model against both real-world and worst-case distribution shifts in the data. DRQ analyzes the robustness of local decision regions in the vicinity of a given data point to make more reliable predictions. We theoretically motivate the DRQ algorithm by showing that it effectively smooths spurious local extrema in the decision surface. Furthermore, we propose an implementation using targeted and untargeted adversarial attacks. 
An extensive empirical evaluation shows that DRQ increases the robustness of adversarially and non-adversarially trained models against real-world and worst-case distribution shifts on several computer vision benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-schwinn22a,\n title = \t {Improving Robustness against Real-World and Worst-Case Distribution Shifts through Decision Region Quantification},\n author = {Schwinn, Leo and Bungert, Leon and Nguyen, An and Raab, Ren{\\'e} and Pulsmeyer, Falk and Precup, Doina and Eskofier, Bjoern and Zanca, Dario},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19434--19449},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/schwinn22a/schwinn22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/schwinn22a.html},\n abstract = \t {The reliability of neural networks is essential for their use in safety-critical applications. Existing approaches generally aim at improving the robustness of neural networks to either real-world distribution shifts (e.g., common corruptions and perturbations, spatial transformations, and natural adversarial examples) or worst-case distribution shifts (e.g., optimized adversarial examples). In this work, we propose the Decision Region Quantification (DRQ) algorithm to improve the robustness of any differentiable pre-trained model against both real-world and worst-case distribution shifts in the data. DRQ analyzes the robustness of local decision regions in the vicinity of a given data point to make more reliable predictions. We theoretically motivate the DRQ algorithm by showing that it effectively smooths spurious local extrema in the decision surface. Furthermore, we propose an implementation using targeted and untargeted adversarial attacks. 
An extensive empirical evaluation shows that DRQ increases the robustness of adversarially and non-adversarially trained models against real-world and worst-case distribution shifts on several computer vision benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/schwinn22a/schwinn22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/schwinn22a-supp.zip", "pdf_size": 1400890, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8213844209392218653&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Erlangen-N\u00fcrnberg+Mila - Quebec AI Institute; Hausdorff Center for Mathematics, University of Bonn; University of Erlangen-N\u00fcrnberg; University of Erlangen-N\u00fcrnberg; University of Erlangen-N\u00fcrnberg; Mila - Quebec AI Institute+McGill University+Google Deepmind; University of Erlangen-N\u00fcrnberg; University of Erlangen-N\u00fcrnberg", "aff_domain": "fau.de; ; ; ; ;mila.quebec; ; ", "email": "fau.de; ; ; ; ;mila.quebec; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/schwinn22a.html", "aff_unique_index": "0+1;2;0;0;0;1+3+4;0;0", "aff_unique_norm": "Friedrich-Alexander-Universit\u00e4t Erlangen-N\u00fcrnberg;Quebec AI Institute;University of Bonn;McGill University;DeepMind", "aff_unique_dep": ";AI Institute;Hausdorff Center for Mathematics;;DeepMind", "aff_unique_url": "https://www fau.de;https://mila.quebec;https://www.uni-bonn.de;https://www.mcgill.ca;https://deepmind.com", "aff_unique_abbr": "FAU;Mila;;McGill;DeepMind", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0;0;1+1+2;0;0", "aff_country_unique": "Germany;Canada;United Kingdom" }, { "title": "Improving Screening Processes via Calibrated Subset Selection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17043", "id": "17043", "proceeding": "https://proceedings.mlr.press/v162/wang22j.html", "poster": "", "slides": "/media/icml-2022/Slides/17043.pdf", "author_site": "Luke Lequn Wang, Thorsten Joachims, Manuel Gomez-Rodriguez", "author": "Lequn Wang; Thorsten Joachims; Manuel Gomez Rodriguez", "abstract": "Many selection processes such as finding patients qualifying for a medical trial or retrieval pipelines in search engines consist of multiple stages, where an initial screening stage focuses the resources on shortlisting the most promising candidates. In this paper, we investigate what guarantees a screening classifier can provide, independently of whether it is constructed manually or trained. We find that current solutions do not enjoy distribution-free theoretical guarantees and we show that, in general, even for a perfectly calibrated classifier, there always exist specific pools of candidates for which its shortlist is suboptimal. Then, we develop a distribution-free screening algorithm\u2014called Calibrated Subsect Selection (CSS)\u2014that, given any classifier and some amount of calibration data, finds near-optimal shortlists of candidates that contain a desired number of qualified candidates in expectation. Moreover, we show that a variant of CSS that calibrates a given classifier multiple times across specific groups can create shortlists with provable diversity guarantees. 
Experiments on US Census survey data validate our theoretical results and show that the shortlists provided by our algorithm are superior to those provided by several competitive baselines.", "bibtex": "@InProceedings{pmlr-v162-wang22j,\n title = \t {Improving Screening Processes via Calibrated Subset Selection},\n author = {Wang, Lequn and Joachims, Thorsten and Rodriguez, Manuel Gomez},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22702--22726},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22j/wang22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22j.html},\n abstract = \t {Many selection processes such as finding patients qualifying for a medical trial or retrieval pipelines in search engines consist of multiple stages, where an initial screening stage focuses the resources on shortlisting the most promising candidates. In this paper, we investigate what guarantees a screening classifier can provide, independently of whether it is constructed manually or trained. We find that current solutions do not enjoy distribution-free theoretical guarantees and we show that, in general, even for a perfectly calibrated classifier, there always exist specific pools of candidates for which its shortlist is suboptimal. Then, we develop a distribution-free screening algorithm\u2014called Calibrated Subsect Selection (CSS)\u2014that, given any classifier and some amount of calibration data, finds near-optimal shortlists of candidates that contain a desired number of qualified candidates in expectation. Moreover, we show that a variant of CSS that calibrates a given classifier multiple times across specific groups can create shortlists with provable diversity guarantees. 
Experiments on US Census survey data validate our theoretical results and show that the shortlists provided by our algorithm are superior to those provided by several competitive baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22j/wang22j.pdf", "supp": "", "pdf_size": 470471, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9485317495432772346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Cornell University + Max Planck Institute for Software Systems; Department of Computer Science, Cornell University; Max Planck Institute for Software Systems", "aff_domain": "cornell.edu;cs.cornell.edu;mpi-sws.org", "email": "cornell.edu;cs.cornell.edu;mpi-sws.org", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22j.html", "aff_unique_index": "0+1;0;1", "aff_unique_norm": "Cornell University;Max Planck Institute for Software Systems", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.cornell.edu;https://www.mpi-sws.org", "aff_unique_abbr": "Cornell;MPI-SWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1", "aff_country_unique": "United States;Germany" }, { "title": "Improving Task-free Continual Learning by Distributionally Robust Memory Evolution", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17575", "id": "17575", "proceeding": "https://proceedings.mlr.press/v162/wang22v.html", "poster": "/media/PosterPDFs/ICML%202022/bbc92a647199b832ec90d7cf57074e9e.png?t=1657329613.5663083", "slides": "", "author_site": "Zhenyi Wang, Li Shen, Le Fang, Qiuling Suo, Tiehang Duan, Mingchen Gao", "author": "Zhenyi Wang; Li Shen; Le Fang; Qiuling Suo; Tiehang Duan; Mingchen Gao", "abstract": "Task-free continual learning (CL) aims to learn a non-stationary data stream without explicit task definitions and not forget previous knowledge. The widely adopted memory replay approach could gradually become less effective for long data streams, as the model may memorize the stored examples and overfit the memory buffer. Second, existing methods overlook the high uncertainty in the memory data distribution since there is a big gap between the memory data distribution and the distribution of all the previous data examples. To address these problems, for the first time, we propose a principled memory evolution framework to dynamically evolve the memory data distribution by making the memory buffer gradually harder to be memorized with distributionally robust optimization (DRO). We then derive a family of methods to evolve the memory buffer data in the continuous probability measure space with Wasserstein gradient flow (WGF). The proposed DRO is w.r.t the worst-case evolved memory data distribution, thus guarantees the model performance and learns significantly more robust features than existing memory-replay-based methods. Extensive experiments on existing benchmarks demonstrate the effectiveness of the proposed methods for alleviating forgetting. 
As a by-product of the proposed framework, our method is more robust to adversarial examples than existing task-free CL methods.", "bibtex": "@InProceedings{pmlr-v162-wang22v,\n title = \t {Improving Task-free Continual Learning by Distributionally Robust Memory Evolution},\n author = {Wang, Zhenyi and Shen, Li and Fang, Le and Suo, Qiuling and Duan, Tiehang and Gao, Mingchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22985--22998},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22v/wang22v.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22v.html},\n abstract = \t {Task-free continual learning (CL) aims to learn a non-stationary data stream without explicit task definitions and not forget previous knowledge. The widely adopted memory replay approach could gradually become less effective for long data streams, as the model may memorize the stored examples and overfit the memory buffer. Second, existing methods overlook the high uncertainty in the memory data distribution since there is a big gap between the memory data distribution and the distribution of all the previous data examples. To address these problems, for the first time, we propose a principled memory evolution framework to dynamically evolve the memory data distribution by making the memory buffer gradually harder to be memorized with distributionally robust optimization (DRO). We then derive a family of methods to evolve the memory buffer data in the continuous probability measure space with Wasserstein gradient flow (WGF). The proposed DRO is w.r.t the worst-case evolved memory data distribution, thus guarantees the model performance and learns significantly more robust features than existing memory-replay-based methods. Extensive experiments on existing benchmarks demonstrate the effectiveness of the proposed methods for alleviating forgetting. 
As a by-product of the proposed framework, our method is more robust to adversarial examples than existing task-free CL methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22v/wang22v.pdf", "supp": "", "pdf_size": 816519, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14894776006626228965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, University at Buffalo, NY, USA; JD Explore Academy, Beijing, China; Department of Computer Science and Engineering, University at Buffalo, NY, USA; Department of Computer Science and Engineering, University at Buffalo, NY, USA; Meta, Seattle, WA, USA; Department of Computer Science and Engineering, University at Buffalo, NY, USA", "aff_domain": "buffalo.edu;gmail.com; ; ; ;buffalo.edu", "email": "buffalo.edu;gmail.com; ; ; ;buffalo.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22v.html", "aff_unique_index": "0;1;0;0;2;0", "aff_unique_norm": "University at Buffalo;JD;Meta", "aff_unique_dep": "Department of Computer Science and Engineering;JD Explore Academy;Meta Platforms, Inc.", "aff_unique_url": "https://www.buffalo.edu;;https://www.meta.com", "aff_unique_abbr": "UB;;Meta", "aff_campus_unique_index": "0;1;0;0;2;0", "aff_campus_unique": "Buffalo;Beijing;Seattle", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Improving Transformers with Probabilistic Attention Keys", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17955", "id": "17955", "proceeding": "https://proceedings.mlr.press/v162/nguyen22c.html", "poster": "/media/PosterPDFs/ICML%202022/4b17d3264fd9070a5be706c853ccd720_vBnNqWw.png?t=1658041046.2584872", "slides": "", "author_site": "Tam Nguyen, Tan Nguyen, Dung Le, Duy Khuong Nguyen, Viet-Anh Tran, Richard Baraniuk, Nhat Ho, Stanley Osher", "author": "Tam Minh Nguyen; Tan Minh Nguyen; Dung D. D. Le; Duy Khuong Nguyen; Viet-Anh Tran; Richard Baraniuk; Nhat Ho; Stanley Osher", "abstract": "Multi-head attention is a driving force behind state-of-the-art transformers, which achieve remarkable performance across a variety of natural language processing (NLP) and computer vision tasks. It has been observed that for many applications, those attention heads learn redundant embedding, and most of them can be removed without degrading the performance of the model. Inspired by this observation, we propose Transformer with a Mixture of Gaussian Keys (Transformer-MGK), a novel transformer architecture that replaces redundant heads in transformers with a mixture of keys at each head. These mixtures of keys follow a Gaussian mixture model and allow each attention head to focus on different parts of the input sequence efficiently. Compared to its conventional transformer counterpart, Transformer-MGK accelerates training and inference, has fewer parameters, and requires fewer FLOPs to compute while achieving comparable or better accuracy across tasks. Transformer-MGK can also be easily extended to use with linear attention. We empirically demonstrate the advantage of Transformer-MGK in a range of practical applications, including language modeling and tasks that involve very long sequences. 
On the Wikitext-103 and Long Range Arena benchmark, Transformer-MGKs with 4 heads attain comparable or better performance to the baseline transformers with 8 heads.", "bibtex": "@InProceedings{pmlr-v162-nguyen22c,\n title = \t {Improving Transformers with Probabilistic Attention Keys},\n author = {Nguyen, Tam Minh and Nguyen, Tan Minh and Le, Dung D. D. and Nguyen, Duy Khuong and Tran, Viet-Anh and Baraniuk, Richard and Ho, Nhat and Osher, Stanley},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16595--16621},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nguyen22c/nguyen22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/nguyen22c.html},\n abstract = \t {Multi-head attention is a driving force behind state-of-the-art transformers, which achieve remarkable performance across a variety of natural language processing (NLP) and computer vision tasks. It has been observed that for many applications, those attention heads learn redundant embedding, and most of them can be removed without degrading the performance of the model. Inspired by this observation, we propose Transformer with a Mixture of Gaussian Keys (Transformer-MGK), a novel transformer architecture that replaces redundant heads in transformers with a mixture of keys at each head. These mixtures of keys follow a Gaussian mixture model and allow each attention head to focus on different parts of the input sequence efficiently. Compared to its conventional transformer counterpart, Transformer-MGK accelerates training and inference, has fewer parameters, and requires fewer FLOPs to compute while achieving comparable or better accuracy across tasks. Transformer-MGK can also be easily extended to use with linear attention. We empirically demonstrate the advantage of Transformer-MGK in a range of practical applications, including language modeling and tasks that involve very long sequences. 
On the Wikitext-103 and Long Range Arena benchmark, Transformer-MGKs with 4 heads attain comparable or better performance to the baseline transformers with 8 heads.}\n}", "pdf": "https://proceedings.mlr.press/v162/nguyen22c/nguyen22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/nguyen22c-supp.zip", "pdf_size": 3953483, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15369073464631209004&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/nguyen22c.html" }, { "title": "In defense of dual-encoders for neural ranking", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16409", "id": "16409", "proceeding": "https://proceedings.mlr.press/v162/menon22a.html", "poster": "/media/PosterPDFs/ICML%202022/06997f04a7db92466a2baa6ebc8b872d.png?t=1657544131.7941558", "slides": "", "author_site": "Aditya Menon, Sadeep Jayasumana, Ankit Singh Rawat, Seungyeon Kim, Sashank Jakkam Reddi, Sanjiv Kumar", "author": "Aditya Menon; Sadeep Jayasumana; Ankit Singh Rawat; Seungyeon Kim; Sashank Reddi; Sanjiv Kumar", "abstract": "Transformer-based models such as BERT have proven successful in information retrieval problem, which seek to identify relevant documents for a given query. There are two broad flavours of such models: cross-attention (CA) models, which learn a joint embedding for the query and document, and dual-encoder (DE) models, which learn separate embeddings for the query and document. Empirically, CA models are often found to be more accurate, which has motivated a series of works seeking to bridge this gap. However, a more fundamental question remains less explored: does this performance gap reflect an inherent limitation in the capacity of DE models, or a limitation in the training of such models? And does such an understanding suggest a principled means of improving DE models? In this paper, we study these questions, with three contributions. First, we establish theoretically that with a sufficiently large embedding dimension, DE models have the capacity to model a broad class of score distributions. Second, we show empirically that on real-world problems, DE models may overfit to spurious correlations in the training set, and thus under-perform on test samples. To mitigate this behaviour, we propose a suitable distillation strategy, and confirm its practical efficacy on the MSMARCO-Passage and Natural Questions benchmarks.", "bibtex": "@InProceedings{pmlr-v162-menon22a,\n title = \t {In defense of dual-encoders for neural ranking},\n author = {Menon, Aditya and Jayasumana, Sadeep and Rawat, Ankit Singh and Kim, Seungyeon and Reddi, Sashank and Kumar, Sanjiv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15376--15400},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/menon22a/menon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/menon22a.html},\n abstract = \t {Transformer-based models such as BERT have proven successful in information retrieval problem, which seek to identify relevant documents for a given query. 
There are two broad flavours of such models: cross-attention (CA) models, which learn a joint embedding for the query and document, and dual-encoder (DE) models, which learn separate embeddings for the query and document. Empirically, CA models are often found to be more accurate, which has motivated a series of works seeking to bridge this gap. However, a more fundamental question remains less explored: does this performance gap reflect an inherent limitation in the capacity of DE models, or a limitation in the training of such models? And does such an understanding suggest a principled means of improving DE models? In this paper, we study these questions, with three contributions. First, we establish theoretically that with a sufficiently large embedding dimension, DE models have the capacity to model a broad class of score distributions. Second, we show empirically that on real-world problems, DE models may overfit to spurious correlations in the training set, and thus under-perform on test samples. To mitigate this behaviour, we propose a suitable distillation strategy, and confirm its practical efficacy on the MSMARCO-Passage and Natural Questions benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/menon22a/menon22a.pdf", "supp": "", "pdf_size": 721475, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1752802016858903204&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA", "aff_domain": "google.com; ; ; ; ; ", "email": "google.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/menon22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Independent Policy Gradient for Large-Scale Markov Potential Games: Sharper Rates, Function Approximation, and Game-Agnostic Convergence", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18061", "id": "18061", "proceeding": "https://proceedings.mlr.press/v162/ding22b.html", "poster": "/media/PosterPDFs/ICML%202022/852c296dfa59522f563aef29d8d0adf6_SOn2CBN.png?t=1658257902.8580713", "slides": "", "author_site": "Dongsheng Ding, Chen-Yu Wei, Kaiqing Zhang, Mihailo Jovanovic", "author": "Dongsheng Ding; Chen-Yu Wei; Kaiqing Zhang; Mihailo Jovanovic", "abstract": "We examine global non-asymptotic convergence properties of policy gradient methods for multi-agent reinforcement learning (RL) problems in Markov potential games (MPGs). To learn a Nash equilibrium of an MPG in which the size of state space and/or the number of players can be very large, we propose new independent policy gradient algorithms that are run by all players in tandem. When there is no uncertainty in the gradient evaluation, we show that our algorithm finds an $\\epsilon$-Nash equilibrium with $O(1/\\epsilon^2)$ iteration complexity which does not explicitly depend on the state space size. 
When the exact gradient is not available, we establish $O(1/\\epsilon^5)$ sample complexity bound in a potentially infinitely large state space for a sample-based algorithm that utilizes function approximation. Moreover, we identify a class of independent policy gradient algorithms that enjoy convergence for both zero-sum Markov games and Markov cooperative games with the players that are oblivious to the types of games being played. Finally, we provide computational experiments to corroborate the merits and the effectiveness of our theoretical developments.", "bibtex": "@InProceedings{pmlr-v162-ding22b,\n title = \t {Independent Policy Gradient for Large-Scale {M}arkov Potential Games: Sharper Rates, Function Approximation, and Game-Agnostic Convergence},\n author = {Ding, Dongsheng and Wei, Chen-Yu and Zhang, Kaiqing and Jovanovic, Mihailo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5166--5220},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ding22b/ding22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/ding22b.html},\n abstract = \t {We examine global non-asymptotic convergence properties of policy gradient methods for multi-agent reinforcement learning (RL) problems in Markov potential games (MPGs). To learn a Nash equilibrium of an MPG in which the size of state space and/or the number of players can be very large, we propose new independent policy gradient algorithms that are run by all players in tandem. When there is no uncertainty in the gradient evaluation, we show that our algorithm finds an $\\epsilon$-Nash equilibrium with $O(1/\\epsilon^2)$ iteration complexity which does not explicitly depend on the state space size. When the exact gradient is not available, we establish $O(1/\\epsilon^5)$ sample complexity bound in a potentially infinitely large state space for a sample-based algorithm that utilizes function approximation. Moreover, we identify a class of independent policy gradient algorithms that enjoy convergence for both zero-sum Markov games and Markov cooperative games with the players that are oblivious to the types of games being played. 
Finally, we provide computational experiments to corroborate the merits and the effectiveness of our theoretical developments.}\n}", "pdf": "https://proceedings.mlr.press/v162/ding22b/ding22b.pdf", "supp": "", "pdf_size": 977112, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9803879235723819858&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "University of Southern California; University of Southern California; Massachusetts Institute of Technology; University of Southern California", "aff_domain": "usc.edu;usc.edu;mit.edu;usc.edu", "email": "usc.edu;usc.edu;mit.edu;usc.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ding22b.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Southern California;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://web.mit.edu", "aff_unique_abbr": "USC;MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Individual Preference Stability for Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17653", "id": "17653", "proceeding": "https://proceedings.mlr.press/v162/ahmadi22a.html", "poster": "/media/PosterPDFs/ICML%202022/9d03333181fb0f6bd495e8b157259880_5eG4LjW.png?t=1658092686.7146702", "slides": "/media/icml-2022/Slides/17653.pdf", "author_site": "Saba Ahmadi, Pranjal Awasthi, Samir Khuller, Matth\u00e4us Kleindessner, Jamie Morgenstern, Pattara Sukprasert, Ali Vakilian", "author": "Saba Ahmadi; Pranjal Awasthi; Samir Khuller; Matth\u00e4us Kleindessner; Jamie Morgenstern; Pattara Sukprasert; Ali Vakilian", "abstract": "In this paper, we propose a natural notion of individual preference (IP) stability for clustering, which asks that every data point, on average, is closer to the points in its own cluster than to the points in any other cluster. Our notion can be motivated from several perspectives, including game theory and algorithmic fairness. We study several questions related to our proposed notion. We first show that deciding whether a given data set allows for an IP-stable clustering in general is NP-hard. As a result, we explore the design of efficient algorithms for finding IP-stable clusterings in some restricted metric spaces. We present a polytime algorithm to find a clustering satisfying exact IP-stability on the real line, and an efficient algorithm to find an IP-stable 2-clustering for a tree metric. We also consider relaxing the stability constraint, i.e., every data point should not be too far from its own cluster compared to any other cluster. For this case, we provide polytime algorithms with different guarantees. 
We evaluate some of our algorithms and several standard clustering approaches on real data sets.", "bibtex": "@InProceedings{pmlr-v162-ahmadi22a,\n title = \t {Individual Preference Stability for Clustering},\n author = {Ahmadi, Saba and Awasthi, Pranjal and Khuller, Samir and Kleindessner, Matth{\\\"a}us and Morgenstern, Jamie and Sukprasert, Pattara and Vakilian, Ali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {197--246},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ahmadi22a/ahmadi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ahmadi22a.html},\n abstract = \t {In this paper, we propose a natural notion of individual preference (IP) stability for clustering, which asks that every data point, on average, is closer to the points in its own cluster than to the points in any other cluster. Our notion can be motivated from several perspectives, including game theory and algorithmic fairness. We study several questions related to our proposed notion. We first show that deciding whether a given data set allows for an IP-stable clustering in general is NP-hard. As a result, we explore the design of efficient algorithms for finding IP-stable clusterings in some restricted metric spaces. We present a polytime algorithm to find a clustering satisfying exact IP-stability on the real line, and an efficient algorithm to find an IP-stable 2-clustering for a tree metric. We also consider relaxing the stability constraint, i.e., every data point should not be too far from its own cluster compared to any other cluster. For this case, we provide polytime algorithms with different guarantees. 
We evaluate some of our algorithms and several standard clustering approaches on real data sets.}\n}", "pdf": "https://proceedings.mlr.press/v162/ahmadi22a/ahmadi22a.pdf", "supp": "", "pdf_size": 8705957, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5704874975941768336&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Toyota Technological Institute at Chicago, USA; Google, USA; Northwestern University, USA; Amazon Web Services, Germany; University of Washington, USA; Northwestern University, USA; Toyota Technological Institute at Chicago, USA", "aff_domain": "ttic.edu;google.com;northwestern.edu;amazon.de;cs.washington.edu;u.northwestern.edu;ttic.edu", "email": "ttic.edu;google.com;northwestern.edu;amazon.de;cs.washington.edu;u.northwestern.edu;ttic.edu", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/ahmadi22a.html", "aff_unique_index": "0;1;2;3;4;2;0", "aff_unique_norm": "Toyota Technological Institute at Chicago;Google;Northwestern University;Amazon;University of Washington", "aff_unique_dep": ";Google;;Amazon Web Services;", "aff_unique_url": "https://www.tti-chicago.org;https://www.google.com;https://www.northwestern.edu;https://aws.amazon.com/de;https://www.washington.edu", "aff_unique_abbr": "TTI Chicago;Google;NU;AWS;UW", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Chicago;Mountain View;", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United States;Germany" }, { "title": "Individual Reward Assisted Multi-Agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17843", "id": "17843", "proceeding": "https://proceedings.mlr.press/v162/wang22ao.html", "poster": "/media/PosterPDFs/ICML%202022/4fc66104f8ada6257fa55f29a2a567c7.png?t=1657962065.136053", "slides": "", "author_site": "Li Wang, Yupeng Zhang, Yujing Hu, Weixun Wang, Chongjie Zhang, Yang Gao, Jianye Hao, Tangjie Lv, Changjie Fan", "author": "Li Wang; Yupeng Zhang; Yujing Hu; Weixun Wang; Chongjie Zhang; Yang Gao; Jianye Hao; Tangjie Lv; Changjie Fan", "abstract": "In many real-world multi-agent systems, the sparsity of team rewards often makes it difficult for an algorithm to successfully learn a cooperative team policy. At present, the common way for solving this problem is to design some dense individual rewards for the agents to guide the cooperation. However, most existing works utilize individual rewards in ways that do not always promote teamwork and sometimes are even counterproductive. 
In this paper, we propose", "bibtex": "@InProceedings{pmlr-v162-wang22ao,\n title = \t {Individual Reward Assisted Multi-Agent Reinforcement Learning},\n author = {Wang, Li and Zhang, Yupeng and Hu, Yujing and Wang, Weixun and Zhang, Chongjie and Gao, Yang and Hao, Jianye and Lv, Tangjie and Fan, Changjie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23417--23432},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ao/wang22ao.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ao.html},\n abstract = \t {In many real-world multi-agent systems, the sparsity of team rewards often makes it difficult for an algorithm to successfully learn a cooperative team policy. At present, the common way for solving this problem is to design some dense individual rewards for the agents to guide the cooperation. However, most existing works utilize individual rewards in ways that do not always promote teamwork and sometimes are even counterproductive. In this paper, we propose", "pdf": "https://proceedings.mlr.press/v162/wang22ao/wang22ao.pdf", "supp": "", "pdf_size": 1402433, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16031970650666982086&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/wang22ao.html" }, { "title": "Inducing Causal Structure for Interpretable Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16345", "id": "16345", "proceeding": "https://proceedings.mlr.press/v162/geiger22a.html", "poster": "", "slides": "", "author_site": "Atticus Geiger, Zhengxuan Wu, Hanson Lu, Joshua Rozner, Elisa Kreiss, Thomas Icard, Noah Goodman, Christopher Potts", "author": "Atticus Geiger; Zhengxuan Wu; Hanson Lu; Josh Rozner; Elisa Kreiss; Thomas Icard; Noah Goodman; Christopher Potts", "abstract": "In many areas, we have well-founded insights about causal structure that would be useful to bring into our trained models while still allowing them to learn in a data-driven fashion. To achieve this, we present the new method of interchange intervention training (IIT). In IIT, we (1) align variables in a causal model (e.g., a deterministic program or Bayesian network) with representations in a neural model and (2) train the neural model to match the counterfactual behavior of the causal model on a base input when aligned representations in both models are set to be the value they would be for a source input. IIT is fully differentiable, flexibly combines with other objectives, and guarantees that the target causal model is a causal abstraction of the neural model when its loss is zero. We evaluate IIT on a structural vision task (MNIST-PVR), a navigational language task (ReaSCAN), and a natural language inference task (MQNLI). We compare IIT against multi-task training objectives and data augmentation. 
In all our experiments, IIT achieves the best results and produces neural models that are more interpretable in the sense that they more successfully realize the target causal model.", "bibtex": "@InProceedings{pmlr-v162-geiger22a,\n title = \t {Inducing Causal Structure for Interpretable Neural Networks},\n author = {Geiger, Atticus and Wu, Zhengxuan and Lu, Hanson and Rozner, Josh and Kreiss, Elisa and Icard, Thomas and Goodman, Noah and Potts, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7324--7338},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/geiger22a/geiger22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/geiger22a.html},\n abstract = \t {In many areas, we have well-founded insights about causal structure that would be useful to bring into our trained models while still allowing them to learn in a data-driven fashion. To achieve this, we present the new method of interchange intervention training (IIT). In IIT, we (1) align variables in a causal model (e.g., a deterministic program or Bayesian network) with representations in a neural model and (2) train the neural model to match the counterfactual behavior of the causal model on a base input when aligned representations in both models are set to be the value they would be for a source input. IIT is fully differentiable, flexibly combines with other objectives, and guarantees that the target causal model is a causal abstraction of the neural model when its loss is zero. We evaluate IIT on a structural vision task (MNIST-PVR), a navigational language task (ReaSCAN), and a natural language inference task (MQNLI). We compare IIT against multi-task training objectives and data augmentation. 
In all our experiments, IIT achieves the best results and produces neural models that are more interpretable in the sense that they more successfully realize the target causal model.}\n}", "pdf": "https://proceedings.mlr.press/v162/geiger22a/geiger22a.pdf", "supp": "", "pdf_size": 910937, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3318078853003855419&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff": "Stanford University; Stanford University; Stanford University; Stanford University; Stanford University; Stanford University; Stanford University; Stanford University", "aff_domain": "stanford.edu;stanford.edu; ; ; ; ; ; ", "email": "stanford.edu;stanford.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/geiger22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Inductive Biases and Variable Creation in Self-Attention Mechanisms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18073", "id": "18073", "proceeding": "https://proceedings.mlr.press/v162/edelman22a.html", "poster": "/media/PosterPDFs/ICML%202022/747d3443e319a22747fbb873e8b2f9f2.png?t=1657832058.0552385", "slides": "", "author_site": "Benjamin Edelman, Surbhi Goel, Sham Kakade, Cyril Zhang", "author": "Benjamin L Edelman; Surbhi Goel; Sham Kakade; Cyril Zhang", "abstract": "Self-attention, an architectural motif designed to model long-range interactions in sequential data, has driven numerous recent breakthroughs in natural language processing and beyond. This work provides a theoretical analysis of the inductive biases of self-attention modules. Our focus is to rigorously establish which functions and long-range dependencies self-attention blocks prefer to represent. Our main result shows that bounded-norm Transformer networks \"create sparse variables\": a single self-attention head can represent a sparse function of the input sequence, with sample complexity scaling only logarithmically with the context length. To support our analysis, we present synthetic experiments to probe the sample complexity of learning sparse Boolean functions with Transformers.", "bibtex": "@InProceedings{pmlr-v162-edelman22a,\n title = \t {Inductive Biases and Variable Creation in Self-Attention Mechanisms},\n author = {Edelman, Benjamin L and Goel, Surbhi and Kakade, Sham and Zhang, Cyril},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5793--5831},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/edelman22a/edelman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/edelman22a.html},\n abstract = \t {Self-attention, an architectural motif designed to model long-range interactions in sequential data, has driven numerous recent breakthroughs in natural language processing and beyond. This work provides a theoretical analysis of the inductive biases of self-attention modules. 
Our focus is to rigorously establish which functions and long-range dependencies self-attention blocks prefer to represent. Our main result shows that bounded-norm Transformer networks \"create sparse variables\": a single self-attention head can represent a sparse function of the input sequence, with sample complexity scaling only logarithmically with the context length. To support our analysis, we present synthetic experiments to probe the sample complexity of learning sparse Boolean functions with Transformers.}\n}", "pdf": "https://proceedings.mlr.press/v162/edelman22a/edelman22a.pdf", "supp": "", "pdf_size": 830386, "gs_citation": 159, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1755481428186925773&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Harvard University, Cambridge, MA, USA; Microsoft Research, New York, NY, USA; Department of Computer Science, Harvard University, Cambridge, MA, USA; Microsoft Research, New York, NY, USA", "aff_domain": "microsoft.com; ; ; ", "email": "microsoft.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/edelman22a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Harvard University;Microsoft", "aff_unique_dep": "Department of Computer Science;Microsoft Research", "aff_unique_url": "https://www.harvard.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Harvard;MSR", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Cambridge;New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Inductive Matrix Completion: No Bad Local Minima and a Fast Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17495", "id": "17495", "proceeding": "https://proceedings.mlr.press/v162/zilber22a.html", "poster": "/media/PosterPDFs/ICML%202022/d9fc5b73a8d78fad3d6dffe419384e70_YRpPWOY.png?t=1657524512.4089785", "slides": "/media/icml-2022/Slides/17495.pdf", "author_site": "Pini Zilber, Boaz Nadler", "author": "Pini Zilber; Boaz Nadler", "abstract": "The inductive matrix completion (IMC) problem is to recover a low rank matrix from few observed entries while incorporating prior knowledge about its row and column subspaces. In this work, we make three contributions to the IMC problem: (i) we prove that under suitable conditions, the IMC optimization landscape has no bad local minima; (ii) we derive a simple scheme with theoretical guarantees to estimate the rank of the unknown matrix; and (iii) we propose GNIMC, a simple Gauss-Newton based method to solve the IMC problem, analyze its runtime and derive for it strong recovery guarantees. The guarantees for GNIMC are sharper in several aspects than those available for other methods, including a quadratic convergence rate, fewer required observed entries and stability to errors or deviations from low-rank. 
Empirically, given entries observed uniformly at random, GNIMC recovers the underlying matrix substantially faster than several competing methods.", "bibtex": "@InProceedings{pmlr-v162-zilber22a,\n title = \t {Inductive Matrix Completion: No Bad Local Minima and a Fast Algorithm},\n author = {Zilber, Pini and Nadler, Boaz},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27671--27692},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zilber22a/zilber22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zilber22a.html},\n abstract = \t {The inductive matrix completion (IMC) problem is to recover a low rank matrix from few observed entries while incorporating prior knowledge about its row and column subspaces. In this work, we make three contributions to the IMC problem: (i) we prove that under suitable conditions, the IMC optimization landscape has no bad local minima; (ii) we derive a simple scheme with theoretical guarantees to estimate the rank of the unknown matrix; and (iii) we propose GNIMC, a simple Gauss-Newton based method to solve the IMC problem, analyze its runtime and derive for it strong recovery guarantees. The guarantees for GNIMC are sharper in several aspects than those available for other methods, including a quadratic convergence rate, fewer required observed entries and stability to errors or deviations from low-rank. Empirically, given entries observed uniformly at random, GNIMC recovers the underlying matrix substantially faster than several competing methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zilber22a/zilber22a.pdf", "supp": "", "pdf_size": 1687682, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1576217126267485656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Faculty of Mathematics and Computer Science, Weizmann Institute of Science, Israel; Faculty of Mathematics and Computer Science, Weizmann Institute of Science, Israel", "aff_domain": "weizmann.ac.il;weizmann.ac.il", "email": "weizmann.ac.il;weizmann.ac.il", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zilber22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Weizmann Institute of Science", "aff_unique_dep": "Faculty of Mathematics and Computer Science", "aff_unique_url": "https://www.weizmann.ac.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Inferring Cause and Effect in the Presence of Heteroscedastic Noise", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18423", "id": "18423", "proceeding": "https://proceedings.mlr.press/v162/xu22f.html", "poster": "/media/PosterPDFs/ICML%202022/2f014b914ea5e7c04fc6cbde68d02141.png?t=1657706070.3809102", "slides": "", "author_site": "Sascha Xu, Osman Ali Mian, Alexander Marx, Jilles Vreeken", "author": "Sascha Xu; Osman A Mian; Alexander Marx; Jilles Vreeken", "abstract": "We study the problem of identifying cause and effect over two univariate continuous variables $X$ and $Y$ from a sample of their joint distribution. 
Our focus lies on the setting when the variance of the noise may be dependent on the cause. We propose to partition the domain of the cause into multiple segments where the noise indeed is dependent. To this end, we minimize a scale-invariant, penalized regression score, finding the optimal partitioning using dynamic programming. We show under which conditions this allows us to identify the causal direction for the linear setting with heteroscedastic noise, for the non-linear setting with homoscedastic noise, as well as empirically confirm that these results generalize to the non-linear and heteroscedastic case. Altogether, the ability to model heteroscedasticity translates into an improved performance in telling cause from effect on a wide range of synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-xu22f,\n title = \t {Inferring Cause and Effect in the Presence of Heteroscedastic Noise},\n author = {Xu, Sascha and Mian, Osman A and Marx, Alexander and Vreeken, Jilles},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24615--24630},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22f/xu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22f.html},\n abstract = \t {We study the problem of identifying cause and effect over two univariate continuous variables $X$ and $Y$ from a sample of their joint distribution. Our focus lies on the setting when the variance of the noise may be dependent on the cause. We propose to partition the domain of the cause into multiple segments where the noise indeed is dependent. To this end, we minimize a scale-invariant, penalized regression score, finding the optimal partitioning using dynamic programming. We show under which conditions this allows us to identify the causal direction for the linear setting with heteroscedastic noise, for the non-linear setting with homoscedastic noise, as well as empirically confirm that these results generalize to the non-linear and heteroscedastic case. 
Altogether, the ability to model heteroscedasticity translates into an improved performance in telling cause from effect on a wide range of synthetic and real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22f/xu22f.pdf", "supp": "", "pdf_size": 649464, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1236861990839834028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17, "aff": "CISPA Helmholtz Center for Information Security, Saarbr\u00fccken, Germany; CISPA Helmholtz Center for Information Security, Saarbr\u00fccken, Germany; ETH Z\u00fcrich & ETH AI Center, Z\u00fcrich, Switzerland; CISPA Helmholtz Center for Information Security, Saarbr\u00fccken, Germany", "aff_domain": "cispa.de; ; ;cispa.de", "email": "cispa.de; ; ;cispa.de", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/xu22f.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "CISPA Helmholtz Center for Information Security;ETH Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.cispa.de;https://www.ethz.ch", "aff_unique_abbr": "CISPA;ETH", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Saarbr\u00fccken;Z\u00fcrich", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Germany;Switzerland" }, { "title": "Influence-Augmented Local Simulators: a Scalable Solution for Fast Deep RL in Large Networked Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18115", "id": "18115", "proceeding": "https://proceedings.mlr.press/v162/suau22a.html", "poster": "/media/PosterPDFs/ICML%202022/e17a886efc21fa45b9dc49a17c29dcf1.png?t=1657186863.289609", "slides": "", "author_site": "Miguel Suau, Jinke He, Matthijs T. J. Spaan, Frans Oliehoek", "author": "Miguel Suau; Jinke He; Matthijs T. J. Spaan; Frans Oliehoek", "abstract": "Learning effective policies for real-world problems is still an open challenge for the field of reinforcement learning (RL). The main limitation being the amount of data needed and the pace at which that data can be obtained. In this paper, we study how to build lightweight simulators of complicated systems that can run sufficiently fast for deep RL to be applicable. We focus on domains where agents interact with a reduced portion of a larger environment while still being affected by the global dynamics. Our method combines the use of local simulators with learned models that mimic the influence of the global system. The experiments reveal that incorporating this idea into the deep RL workflow can considerably accelerate the training process and presents several opportunities for the future.", "bibtex": "@InProceedings{pmlr-v162-suau22a,\n title = \t {Influence-Augmented Local Simulators: a Scalable Solution for Fast Deep {RL} in Large Networked Systems},\n author = {Suau, Miguel and He, Jinke and Spaan, Matthijs T. J. 
and Oliehoek, Frans},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20604--20624},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/suau22a/suau22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/suau22a.html},\n abstract = \t {Learning effective policies for real-world problems is still an open challenge for the field of reinforcement learning (RL). The main limitation being the amount of data needed and the pace at which that data can be obtained. In this paper, we study how to build lightweight simulators of complicated systems that can run sufficiently fast for deep RL to be applicable. We focus on domains where agents interact with a reduced portion of a larger environment while still being affected by the global dynamics. Our method combines the use of local simulators with learned models that mimic the influence of the global system. The experiments reveal that incorporating this idea into the deep RL workflow can considerably accelerate the training process and presents several opportunities for the future.}\n}", "pdf": "https://proceedings.mlr.press/v162/suau22a/suau22a.pdf", "supp": "", "pdf_size": 1906032, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12641594553369327116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Delft University of Technology; Delft University of Technology; Delft University of Technology; Delft University of Technology", "aff_domain": "tudelft.nl; ; ; ", "email": "tudelft.nl; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/suau22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TU Delft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Information Discrepancy in Strategic Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16479", "id": "16479", "proceeding": "https://proceedings.mlr.press/v162/bechavod22a.html", "poster": "/media/PosterPDFs/ICML%202022/912d2b1c7b2826caf99687388d2e8f7c.png?t=1657713730.2669437", "slides": "", "author_site": "Yahav Bechavod, Chara Podimata, Steven Wu, Juba Ziani", "author": "Yahav Bechavod; Chara Podimata; Steven Wu; Juba Ziani", "abstract": "We initiate the study of the effects of non-transparency in decision rules on individuals\u2019 ability to improve in strategic learning settings. Inspired by real-life settings, such as loan approvals and college admissions, we remove the assumption typically made in the strategic learning literature, that the decision rule is fully known to individuals, and focus instead on settings where it is inaccessible. In their lack of knowledge, individuals try to infer this rule by learning from their peers (e.g., friends and acquaintances who previously applied for a loan), naturally forming groups in the population, each with possibly different type and level of information regarding the decision rule. 
We show that, in equilibrium, the principal\u2019s decision rule optimizing welfare across sub-populations may cause a strong negative externality: the true quality of some of the groups can actually deteriorate. On the positive side, we show that, in many natural cases, optimal improvement can be guaranteed simultaneously for all sub-populations. We further introduce a measure we term information overlap proxy, and demonstrate its usefulness in characterizing the disparity in improvements across sub-populations. Finally, we identify a natural condition under which improvement can be guaranteed for all sub-populations while maintaining high predictive accuracy. We complement our theoretical analysis with experiments on real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-bechavod22a,\n title = \t {Information Discrepancy in Strategic Learning},\n author = {Bechavod, Yahav and Podimata, Chara and Wu, Steven and Ziani, Juba},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1691--1715},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bechavod22a/bechavod22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bechavod22a.html},\n abstract = \t {We initiate the study of the effects of non-transparency in decision rules on individuals\u2019 ability to improve in strategic learning settings. Inspired by real-life settings, such as loan approvals and college admissions, we remove the assumption typically made in the strategic learning literature, that the decision rule is fully known to individuals, and focus instead on settings where it is inaccessible. In their lack of knowledge, individuals try to infer this rule by learning from their peers (e.g., friends and acquaintances who previously applied for a loan), naturally forming groups in the population, each with possibly different type and level of information regarding the decision rule. We show that, in equilibrium, the principal\u2019s decision rule optimizing welfare across sub-populations may cause a strong negative externality: the true quality of some of the groups can actually deteriorate. On the positive side, we show that, in many natural cases, optimal improvement can be guaranteed simultaneously for all sub-populations. We further introduce a measure we term information overlap proxy, and demonstrate its usefulness in characterizing the disparity in improvements across sub-populations. Finally, we identify a natural condition under which improvement can be guaranteed for all sub-populations while maintaining high predictive accuracy. 
We complement our theoretical analysis with experiments on real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/bechavod22a/bechavod22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bechavod22a-supp.zip", "pdf_size": 521114, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=51926345733327750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Computer Science and Engineering, The Hebrew University; School of Engineering and Applied Sciences, Harvard University; School of Computer Science, Carnegie Mellon University; School of Industrial and Systems Engineering, Georgia Institute of Technology", "aff_domain": "cs.huji.ac.il;g.harvard.edu;cmu.edu;gatech.edu", "email": "cs.huji.ac.il;g.harvard.edu;cmu.edu;gatech.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bechavod22a.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Hebrew University;Harvard University;Carnegie Mellon University;Georgia Institute of Technology", "aff_unique_dep": "School of Computer Science and Engineering;School of Engineering and Applied Sciences;School of Computer Science;School of Industrial and Systems Engineering", "aff_unique_url": "http://www.huji.ac.il;https://www.harvard.edu;https://www.cmu.edu;https://www.gatech.edu", "aff_unique_abbr": "HUJI;Harvard;CMU;Georgia Tech", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Cambridge;Pittsburgh;Atlanta", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Israel;United States" }, { "title": "Informed Learning by Wide Neural Networks: Convergence, Generalization and Sampling Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17387", "id": "17387", "proceeding": "https://proceedings.mlr.press/v162/yang22l.html", "poster": "/media/PosterPDFs/ICML%202022/d554f7bb7be44a7267068a7df88ddd20.png?t=1657999241.8521948", "slides": "", "author_site": "Jianyi Yang, Shaolei Ren", "author": "Jianyi Yang; Shaolei Ren", "abstract": "By integrating domain knowledge with labeled samples, informed machine learning has been emerging to improve the learning performance for a wide range of applications. Nonetheless, rigorous understanding of the role of injected domain knowledge has been under-explored. In this paper, we consider an informed deep neural network (DNN) with over-parameterization and domain knowledge integrated into its training objective function, and study how and why domain knowledge benefits the performance. Concretely, we quantitatively demonstrate the two benefits of domain knowledge in informed learning {\u2014} regularizing the label-based supervision and supplementing the labeled samples {\u2014} and reveal the trade-off between label and knowledge imperfectness in the bound of the population risk. Based on the theoretical analysis, we propose a generalized informed training objective to better exploit the benefits of knowledge and balance the label and knowledge imperfectness, which is validated by the population risk bound. 
Our analysis on sampling complexity sheds light on how to choose the hyper-parameters for informed learning, and further justifies the advantages of knowledge informed learning.", "bibtex": "@InProceedings{pmlr-v162-yang22l,\n title = \t {Informed Learning by Wide Neural Networks: Convergence, Generalization and Sampling Complexity},\n author = {Yang, Jianyi and Ren, Shaolei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25198--25240},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22l/yang22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22l.html},\n abstract = \t {By integrating domain knowledge with labeled samples, informed machine learning has been emerging to improve the learning performance for a wide range of applications. Nonetheless, rigorous understanding of the role of injected domain knowledge has been under-explored. In this paper, we consider an informed deep neural network (DNN) with over-parameterization and domain knowledge integrated into its training objective function, and study how and why domain knowledge benefits the performance. Concretely, we quantitatively demonstrate the two benefits of domain knowledge in informed learning {\u2014} regularizing the label-based supervision and supplementing the labeled samples {\u2014} and reveal the trade-off between label and knowledge imperfectness in the bound of the population risk. Based on the theoretical analysis, we propose a generalized informed training objective to better exploit the benefits of knowledge and balance the label and knowledge imperfectness, which is validated by the population risk bound. 
Our analysis on sampling complexity sheds light on how to choose the hyper-parameters for informed learning, and further justifies the advantages of knowledge informed learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22l/yang22l.pdf", "supp": "", "pdf_size": 758012, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14629299159359305395&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, University of California, Riverside, CA 92521, United States; Department of Electrical and Computer Engineering, University of California, Riverside, CA 92521, United States", "aff_domain": "ece.ucr.edu;ece.ucr.edu", "email": "ece.ucr.edu;ece.ucr.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yang22l.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Riverside", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.ucr.edu", "aff_unique_abbr": "UCR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Riverside", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Injecting Logical Constraints into Neural Networks via Straight-Through Estimators", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18335", "id": "18335", "proceeding": "https://proceedings.mlr.press/v162/yang22h.html", "poster": "/media/PosterPDFs/ICML%202022/f31b20466ae89669f9741e047487eb37_kJER7wR.png?t=1657335752.0009022", "slides": "", "author_site": "Zhun Yang, Joohyung Lee, Chiyoun Park", "author": "Zhun Yang; Joohyung Lee; Chiyoun Park", "abstract": "Injecting discrete logical constraints into neural network learning is one of the main challenges in neuro-symbolic AI. We find that a straight-through-estimator, a method introduced to train binary neural networks, could effectively be applied to incorporate logical constraints into neural network learning. More specifically, we design a systematic way to represent discrete logical constraints as a loss function; minimizing this loss using gradient descent via a straight-through-estimator updates the neural network\u2019s weights in the direction that the binarized outputs satisfy the logical constraints. The experimental results show that by leveraging GPUs and batch training, this method scales significantly better than existing neuro-symbolic methods that require heavy symbolic computation for computing gradients. 
Also, we demonstrate that our method applies to different types of neural networks, such as MLP, CNN, and GNN, making them learn with no or fewer labeled data by learning directly from known constraints.", "bibtex": "@InProceedings{pmlr-v162-yang22h,\n title = \t {Injecting Logical Constraints into Neural Networks via Straight-Through Estimators},\n author = {Yang, Zhun and Lee, Joohyung and Park, Chiyoun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25096--25122},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22h/yang22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22h.html},\n abstract = \t {Injecting discrete logical constraints into neural network learning is one of the main challenges in neuro-symbolic AI. We find that a straight-through-estimator, a method introduced to train binary neural networks, could effectively be applied to incorporate logical constraints into neural network learning. More specifically, we design a systematic way to represent discrete logical constraints as a loss function; minimizing this loss using gradient descent via a straight-through-estimator updates the neural network\u2019s weights in the direction that the binarized outputs satisfy the logical constraints. The experimental results show that by leveraging GPUs and batch training, this method scales significantly better than existing neuro-symbolic methods that require heavy symbolic computation for computing gradients. 
Also, we demonstrate that our method applies to different types of neural networks, such as MLP, CNN, and GNN, making them learn with no or fewer labeled data by learning directly from known constraints.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22h/yang22h.pdf", "supp": "", "pdf_size": 1918665, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16565793160241776542&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "School of Computing and Augmented Intelligence, Fulton Schools of Engineering, Arizona State University, Tempe, AZ, USA+Samsung Research, Samsung Electronics Co., Seoul, South Korea; School of Computing and Augmented Intelligence, Fulton Schools of Engineering, Arizona State University, Tempe, AZ, USA; Samsung Research, Samsung Electronics Co., Seoul, South Korea", "aff_domain": "asu.edu;asu.edu;samsung.com", "email": "asu.edu;asu.edu;samsung.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yang22h.html", "aff_unique_index": "0+1;0;1", "aff_unique_norm": "Arizona State University;Samsung", "aff_unique_dep": "School of Computing and Augmented Intelligence;Samsung Research", "aff_unique_url": "https://www.asu.edu;https://www.samsung.com", "aff_unique_abbr": "ASU;Samsung", "aff_campus_unique_index": "0+1;0;1", "aff_campus_unique": "Tempe;Seoul", "aff_country_unique_index": "0+1;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "Input Dependent Sparse Gaussian Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16063", "id": "16063", "proceeding": "https://proceedings.mlr.press/v162/jafrasteh22a.html", "poster": "/media/PosterPDFs/ICML%202022/b32e8760418e68f23c811a1cfd6bda78.png?t=1655897720.958197", "slides": "", "author_site": "Bahram Jafrasteh, Carlos Villacampa-Calvo, Daniel Hernandez-Lobato", "author": "Bahram Jafrasteh; Carlos Villacampa-Calvo; Daniel Hernandez-Lobato", "abstract": "Gaussian Processes (GPs) are non-parametric models that provide accurate uncertainty estimates. Nevertheless, they have a cubic cost in the number of data instances $N$. To overcome this, sparse GP approximations are used, in which a set of $M \\ll N$ inducing points is introduced. The location of the inducing points is learned by considering them parameters of an approximate posterior distribution $q$. Sparse GPs, combined with stochastic variational inference for inferring $q$ have a cost per iteration in $\\mathcal{O}(M^3)$. Critically, the inducing points determine the flexibility of the model and they are often located in regions where the latent function changes. A limitation is, however, that in some tasks a large number of inducing points may be required to obtain good results. To alleviate this, we propose here to amortize the computation of the inducing points locations, as well as the parameters of $q$. For this, we use a neural network that receives a data instance as an input and outputs the corresponding inducing points locations and the parameters of $q$. We evaluate our method in several experiments, showing that it performs similar or better than other state-of-the-art sparse variational GPs. However, in our method the number of inducing points is reduced drastically since they depend on the input data. 
This makes our method scale to larger datasets and have faster training and prediction times.", "bibtex": "@InProceedings{pmlr-v162-jafrasteh22a,\n title = \t {Input Dependent Sparse {G}aussian Processes},\n author = {Jafrasteh, Bahram and Villacampa-Calvo, Carlos and Hernandez-Lobato, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9739--9759},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jafrasteh22a/jafrasteh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jafrasteh22a.html},\n abstract = \t {Gaussian Processes (GPs) are non-parametric models that provide accurate uncertainty estimates. Nevertheless, they have a cubic cost in the number of data instances $N$. To overcome this, sparse GP approximations are used, in which a set of $M \\ll N$ inducing points is introduced. The location of the inducing points is learned by considering them parameters of an approximate posterior distribution $q$. Sparse GPs, combined with stochastic variational inference for inferring $q$ have a cost per iteration in $\\mathcal{O}(M^3)$. Critically, the inducing points determine the flexibility of the model and they are often located in regions where the latent function changes. A limitation is, however, that in some tasks a large number of inducing points may be required to obtain good results. To alleviate this, we propose here to amortize the computation of the inducing points locations, as well as the parameters of $q$. For this, we use a neural network that receives a data instance as an input and outputs the corresponding inducing points locations and the parameters of $q$. We evaluate our method in several experiments, showing that it performs similar or better than other state-of-the-art sparse variational GPs. However, in our method the number of inducing points is reduced drastically since they depend on the input data. This makes our method scale to larger datasets and have faster training and prediction times.}\n}", "pdf": "https://proceedings.mlr.press/v162/jafrasteh22a/jafrasteh22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jafrasteh22a-supp.zip", "pdf_size": 9417002, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5773381144477961556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jafrasteh22a.html" }, { "title": "Input-agnostic Certified Group Fairness via Gaussian Parameter Smoothing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18357", "id": "18357", "proceeding": "https://proceedings.mlr.press/v162/jin22g.html", "poster": "/media/PosterPDFs/ICML%202022/5faf461eff3099671ad63c6f3f094f7f.png?t=1657309007.2551386", "slides": "", "author_site": "Jiayin Jin, Zeru Zhang, Yang Zhou, Lingfei Wu", "author": "Jiayin Jin; Zeru Zhang; Yang Zhou; Lingfei Wu", "abstract": "Only recently, researchers attempt to provide classification algorithms with provable group fairness guarantees. Most of these algorithms suffer from harassment caused by the requirement that the training and deployment data follow the same distribution. 
This paper proposes an input-agnostic certified group fairness algorithm, FairSmooth, for improving the fairness of classification models while maintaining the remarkable prediction accuracy. A Gaussian parameter smoothing method is developed to transform base classifiers into their smooth versions. An optimal individual smooth classifier is learnt for each group with only the data regarding the group and an overall smooth classifier for all groups is generated by averaging the parameters of all the individual smooth ones. By leveraging the theory of nonlinear functional analysis, the smooth classifiers are reformulated as output functions of a Nemytskii operator. Theoretical analysis is conducted to derive that the Nemytskii operator is smooth and induces a Frechet differentiable smooth manifold. We theoretically demonstrate that the smooth manifold has a global Lipschitz constant that is independent of the domain of the input data, which derives the input-agnostic certified group fairness.", "bibtex": "@InProceedings{pmlr-v162-jin22g,\n title = \t {Input-agnostic Certified Group Fairness via {G}aussian Parameter Smoothing},\n author = {Jin, Jiayin and Zhang, Zeru and Zhou, Yang and Wu, Lingfei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10340--10361},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22g/jin22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22g.html},\n abstract = \t {Only recently, researchers attempt to provide classification algorithms with provable group fairness guarantees. Most of these algorithms suffer from harassment caused by the requirement that the training and deployment data follow the same distribution. This paper proposes an input-agnostic certified group fairness algorithm, FairSmooth, for improving the fairness of classification models while maintaining the remarkable prediction accuracy. A Gaussian parameter smoothing method is developed to transform base classifiers into their smooth versions. An optimal individual smooth classifier is learnt for each group with only the data regarding the group and an overall smooth classifier for all groups is generated by averaging the parameters of all the individual smooth ones. By leveraging the theory of nonlinear functional analysis, the smooth classifiers are reformulated as output functions of a Nemytskii operator. Theoretical analysis is conducted to derive that the Nemytskii operator is smooth and induces a Frechet differentiable smooth manifold. 
We theoretically demonstrate that the smooth manifold has a global Lipschitz constant that is independent of the domain of the input data, which derives the input-agnostic certified group fairness.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22g/jin22g.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jin22g-supp.zip", "pdf_size": 480501, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12481453734761763862&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Auburn University, USA; Auburn University, USA; Auburn University, USA; JD.COM Silicon Valley Research Center, USA", "aff_domain": "auburn.edu; ; ;email.wm.edu", "email": "auburn.edu; ; ;email.wm.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jin22g.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Auburn University;JD.com", "aff_unique_dep": ";Research Center", "aff_unique_url": "https://www.auburn.edu;https://www.jd.com", "aff_unique_abbr": "Auburn;JD", "aff_campus_unique_index": "1", "aff_campus_unique": ";Silicon Valley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Instance Dependent Regret Analysis of Kernelized Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16113", "id": "16113", "proceeding": "https://proceedings.mlr.press/v162/shekhar22a.html", "poster": "/media/PosterPDFs/ICML%202022/285f89b802bcb2651801455c86d78f2a.png?t=1657589153.2463799", "slides": "", "author_site": "Shubhanshu Shekhar, Tara Javidi", "author": "Shubhanshu Shekhar; Tara Javidi", "abstract": "We study the problem of designing an adaptive strategy for querying a noisy zeroth-order-oracle to efficiently learn about the optimizer of an unknown function $f$. To make the problem tractable, we assume that $f$ lies in the reproducing kernel Hilbert space (RKHS) associated with a known kernel $K$, with its norm bounded by $M<\\infty$. Prior results, working in a", "bibtex": "@InProceedings{pmlr-v162-shekhar22a,\n title = \t {Instance Dependent Regret Analysis of Kernelized Bandits},\n author = {Shekhar, Shubhanshu and Javidi, Tara},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19747--19772},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shekhar22a/shekhar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shekhar22a.html},\n abstract = \t {We study the problem of designing an adaptive strategy for querying a noisy zeroth-order-oracle to efficiently learn about the optimizer of an unknown function $f$. To make the problem tractable, we assume that $f$ lies in the reproducing kernel Hilbert space (RKHS) associated with a known kernel $K$, with its norm bounded by $M<\\infty$. 
Prior results, working in a", "pdf": "https://proceedings.mlr.press/v162/shekhar22a/shekhar22a.pdf", "supp": "", "pdf_size": 1169834, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14280317575341541290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Carnegie Mellon University; University of California, San Diego", "aff_domain": "andrew.cmu.edu; ", "email": "andrew.cmu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/shekhar22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.ucsd.edu", "aff_unique_abbr": "CMU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Instrumental Variable Regression with Confounder Balancing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17635", "id": "17635", "proceeding": "https://proceedings.mlr.press/v162/wu22e.html", "poster": "/media/PosterPDFs/ICML%202022/54e36c5ff5f6a1802925ca009f3ebb68.png?t=1657548055.9094594", "slides": "/media/icml-2022/Slides/17635.pdf", "author_site": "Anpeng Wu, Kun Kuang, Bo Li, Fei Wu", "author": "Anpeng Wu; Kun Kuang; Bo Li; Fei Wu", "abstract": "This paper considers the challenge of estimating treatment effects from observational data in the presence of unmeasured confounders. A popular way to address this challenge is to utilize an instrumental variable (IV) for two-stage regression, i.e., 2SLS and variants, but limited to the linear setting. Recently, many nonlinear IV regression variants were proposed to overcome it by regressing the treatment with IVs and observed confounders in stage 1, leading to the imbalance of the observed confounders in stage 2. In this paper, we propose a Confounder Balanced IV Regression (CB-IV) algorithm to jointly remove the bias from the unmeasured confounders and balance the observed confounders. To the best of our knowledge, this is the first work to combine confounder balancing in IV regression for treatment effect estimation. Theoretically, we re-define and solve the inverse problems for the response-outcome function. Experiments show that our algorithm outperforms the existing approaches.", "bibtex": "@InProceedings{pmlr-v162-wu22e,\n title = \t {Instrumental Variable Regression with Confounder Balancing},\n author = {Wu, Anpeng and Kuang, Kun and Li, Bo and Wu, Fei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24056--24075},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22e/wu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22e.html},\n abstract = \t {This paper considers the challenge of estimating treatment effects from observational data in the presence of unmeasured confounders. A popular way to address this challenge is to utilize an instrumental variable (IV) for two-stage regression, i.e., 2SLS and variants, but limited to the linear setting. 
Recently, many nonlinear IV regression variants were proposed to overcome it by regressing the treatment with IVs and observed confounders in stage 1, leading to the imbalance of the observed confounders in stage 2. In this paper, we propose a Confounder Balanced IV Regression (CB-IV) algorithm to jointly remove the bias from the unmeasured confounders and balance the observed confounders. To the best of our knowledge, this is the first work to combine confounder balancing in IV regression for treatment effect estimation. Theoretically, we re-define and solve the inverse problems for the response-outcome function. Experiments show that our algorithm outperforms the existing approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22e/wu22e.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wu22e-supp.zip", "pdf_size": 686312, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13338106286197529680&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science and Technology, Zhejiang University, China; Department of Computer Science and Technology, Zhejiang University, China; School of Economics and Management, Tsinghua University, China; Shanghai Institute for Advanced Study, Zhejiang University, China + Shanghai AI Laboratory, China", "aff_domain": "zju.edu.cn;zju.edu.cn;tsinghua.edu.cn;zju.edu.cn", "email": "zju.edu.cn;zju.edu.cn;tsinghua.edu.cn;zju.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wu22e.html", "aff_unique_index": "0;0;1;0+2", "aff_unique_norm": "Zhejiang University;Tsinghua University;Shanghai AI Laboratory", "aff_unique_dep": "Department of Computer Science and Technology;School of Economics and Management;", "aff_unique_url": "http://www.zju.edu.cn;https://www.tsinghua.edu.cn;", "aff_unique_abbr": "ZJU;Tsinghua;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "China" }, { "title": "Interactive Correlation Clustering with Existential Cluster Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16455", "id": "16455", "proceeding": "https://proceedings.mlr.press/v162/angell22a.html", "poster": "/media/PosterPDFs/ICML%202022/95f6870ff3dcd442254e334a9033d349.png?t=1658157221.5209808", "slides": "", "author_site": "Rico Angell, Nicholas Monath, Nishant Yadav, Andrew McCallum", "author": "Rico Angell; Nicholas Monath; Nishant Yadav; Andrew Mccallum", "abstract": "We consider the problem of clustering with user feedback. Existing methods express constraints about the input data points, most commonly through must-link and cannot-link constraints on data point pairs. In this paper, we introduce existential cluster constraints: a new form of feedback where users indicate the features of desired clusters. Specifically, users make statements about the existence of a cluster having (and not having) particular features. Our approach has multiple advantages: (1) constraints on clusters can express user intent more efficiently than point pairs; (2) in cases where the users\u2019 mental model is of the desired clusters, it is more natural for users to express cluster-wise preferences; (3) it functions even when privacy restrictions prohibit users from seeing raw data. In addition to introducing existential cluster constraints, we provide an inference algorithm for incorporating our constraints into the output clustering. 
Finally, we demonstrate empirically that our proposed framework facilitates more accurate clustering with dramatically fewer user feedback inputs.", "bibtex": "@InProceedings{pmlr-v162-angell22a,\n title = \t {Interactive Correlation Clustering with Existential Cluster Constraints},\n author = {Angell, Rico and Monath, Nicholas and Yadav, Nishant and Mccallum, Andrew},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {703--716},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/angell22a/angell22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/angell22a.html},\n abstract = \t {We consider the problem of clustering with user feedback. Existing methods express constraints about the input data points, most commonly through must-link and cannot-link constraints on data point pairs. In this paper, we introduce existential cluster constraints: a new form of feedback where users indicate the features of desired clusters. Specifically, users make statements about the existence of a cluster having (and not having) particular features. Our approach has multiple advantages: (1) constraints on clusters can express user intent more efficiently than point pairs; (2) in cases where the users\u2019 mental model is of the desired clusters, it is more natural for users to express cluster-wise preferences; (3) it functions even when privacy restrictions prohibit users from seeing raw data. In addition to introducing existential cluster constraints, we provide an inference algorithm for incorporating our constraints into the output clustering. 
Finally, we demonstrate empirically that our proposed framework facilitates more accurate clustering with dramatically fewer user feedback inputs.}\n}", "pdf": "https://proceedings.mlr.press/v162/angell22a/angell22a.pdf", "supp": "", "pdf_size": 599516, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16848197354645183521&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Manning College of Information and Computer Sciences, University of Massachusetts Amherst; Google Research; Manning College of Information and Computer Sciences, University of Massachusetts Amherst; Manning College of Information and Computer Sciences, University of Massachusetts Amherst", "aff_domain": "cs.umass.edu; ; ; ", "email": "cs.umass.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/angell22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Massachusetts Amherst;Google", "aff_unique_dep": "Manning College of Information and Computer Sciences;Google Research", "aff_unique_url": "https://www.umass.edu;https://research.google", "aff_unique_abbr": "UMass Amherst;Google Research", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Amherst;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Interactive Inverse Reinforcement Learning for Cooperative Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17297", "id": "17297", "proceeding": "https://proceedings.mlr.press/v162/buning22a.html", "poster": "/media/PosterPDFs/ICML%202022/e3408432c1a48a52fb6c74d926b38886.png?t=1657204890.1521783", "slides": "", "author_site": "Thomas Kleine Buening, Anne-Marie George, Christos Dimitrakakis", "author": "Thomas Kleine B\u00fcning; Anne-Marie George; Christos Dimitrakakis", "abstract": "We study the problem of designing autonomous agents that can learn to cooperate effectively with a potentially suboptimal partner while having no access to the joint reward function. This problem is modeled as a cooperative episodic two-agent Markov decision process. We assume control over only the first of the two agents in a Stackelberg formulation of the game, where the second agent is acting so as to maximise expected utility given the first agent\u2019s policy. How should the first agent act in order to learn the joint reward function as quickly as possible and so that the joint policy is as close to optimal as possible? We analyse how knowledge about the reward function can be gained in this interactive two-agent scenario. 
We show that when the learning agent\u2019s policies have a significant effect on the transition function, the reward function can be learned efficiently.", "bibtex": "@InProceedings{pmlr-v162-buning22a,\n title = \t {Interactive Inverse Reinforcement Learning for Cooperative Games},\n author = {B{\\\"u}ning, Thomas Kleine and George, Anne-Marie and Dimitrakakis, Christos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2393--2413},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/buning22a/buning22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/buning22a.html},\n abstract = \t {We study the problem of designing autonomous agents that can learn to cooperate effectively with a potentially suboptimal partner while having no access to the joint reward function. This problem is modeled as a cooperative episodic two-agent Markov decision process. We assume control over only the first of the two agents in a Stackelberg formulation of the game, where the second agent is acting so as to maximise expected utility given the first agent\u2019s policy. How should the first agent act in order to learn the joint reward function as quickly as possible and so that the joint policy is as close to optimal as possible? We analyse how knowledge about the reward function can be gained in this interactive two-agent scenario. We show that when the learning agent\u2019s policies have a significant effect on the transition function, the reward function can be learned efficiently.}\n}", "pdf": "https://proceedings.mlr.press/v162/buning22a/buning22a.pdf", "supp": "", "pdf_size": 517369, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16265051782091238494&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "Department of Informatics, University of Oslo, Oslo, Norway+Department of Computer Science, University of Neuchatel, Neuchatel, Switzerland+Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden; Department of Informatics, University of Oslo, Oslo, Norway+Department of Computer Science, University of Neuchatel, Neuchatel, Switzerland+Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden; Department of Informatics, University of Oslo, Oslo, Norway+Department of Computer Science, University of Neuchatel, Neuchatel, Switzerland+Department of Computer Science and Engineering, Chalmers University of Technology, Gothenburg, Sweden", "aff_domain": "ifi.uio.no; ; ", "email": "ifi.uio.no; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/buning22a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", "aff_unique_norm": "University of Oslo;University of Neuchatel;Chalmers University of Technology", "aff_unique_dep": "Department of Informatics;Department of Computer Science;Department of Computer Science and Engineering", "aff_unique_url": "https://www.uio.no;https://www.unine.ch;https://www.chalmers.se", "aff_unique_abbr": "UiO;UNINE;Chalmers", "aff_campus_unique_index": "0+1+2;0+1+2;0+1+2", "aff_campus_unique": "Oslo;Neuchatel;Gothenburg", "aff_country_unique_index": "0+1+2;0+1+2;0+1+2", "aff_country_unique": 
"Norway;Switzerland;Sweden" }, { "title": "Interactively Learning Preference Constraints in Linear Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17119", "id": "17119", "proceeding": "https://proceedings.mlr.press/v162/lindner22a.html", "poster": "/media/PosterPDFs/ICML%202022/0188e8b8b014829e2fa0f430f0a95961.png?t=1657554415.1039662", "slides": "", "author_site": "David Lindner, Sebastian Tschiatschek, Katja Hofmann, Andreas Krause", "author": "David Lindner; Sebastian Tschiatschek; Katja Hofmann; Andreas Krause", "abstract": "We study sequential decision-making with known rewards and unknown constraints, motivated by situations where the constraints represent expensive-to-evaluate human preferences, such as safe and comfortable driving behavior. We formalize the challenge of interactively learning about these constraints as a novel linear bandit problem which we call constrained linear best-arm identification. To solve this problem, we propose the Adaptive Constraint Learning (ACOL) algorithm. We provide an instance-dependent lower bound for constrained linear best-arm identification and show that ACOL\u2019s sample complexity matches the lower bound in the worst-case. In the average case, ACOL\u2019s sample complexity bound is still significantly tighter than bounds of simpler approaches. In synthetic experiments, ACOL performs on par with an oracle solution and outperforms a range of baselines. As an application, we consider learning constraints to represent human preferences in a driving simulation. ACOL is significantly more sample efficient than alternatives for this application. Further, we find that learning preferences as constraints is more robust to changes in the driving scenario than encoding the preferences directly in the reward function.", "bibtex": "@InProceedings{pmlr-v162-lindner22a,\n title = \t {Interactively Learning Preference Constraints in Linear Bandits},\n author = {Lindner, David and Tschiatschek, Sebastian and Hofmann, Katja and Krause, Andreas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13505--13527},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lindner22a/lindner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lindner22a.html},\n abstract = \t {We study sequential decision-making with known rewards and unknown constraints, motivated by situations where the constraints represent expensive-to-evaluate human preferences, such as safe and comfortable driving behavior. We formalize the challenge of interactively learning about these constraints as a novel linear bandit problem which we call constrained linear best-arm identification. To solve this problem, we propose the Adaptive Constraint Learning (ACOL) algorithm. We provide an instance-dependent lower bound for constrained linear best-arm identification and show that ACOL\u2019s sample complexity matches the lower bound in the worst-case. In the average case, ACOL\u2019s sample complexity bound is still significantly tighter than bounds of simpler approaches. In synthetic experiments, ACOL performs on par with an oracle solution and outperforms a range of baselines. 
As an application, we consider learning constraints to represent human preferences in a driving simulation. ACOL is significantly more sample efficient than alternatives for this application. Further, we find that learning preferences as constraints is more robust to changes in the driving scenario than encoding the preferences directly in the reward function.}\n}", "pdf": "https://proceedings.mlr.press/v162/lindner22a/lindner22a.pdf", "supp": "", "pdf_size": 6370705, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10442761554995680158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "ETH Zurich, Switzerland; University of Vienna, Austria; Microsoft Research Cambridge, UK; ETH Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ; ", "email": "inf.ethz.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lindner22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "ETH Zurich;University of Vienna;Microsoft", "aff_unique_dep": ";;Research", "aff_unique_url": "https://www.ethz.ch;https://univie.ac.at;https://www.microsoft.com/en-us/research/group/cambridge", "aff_unique_abbr": "ETHZ;UV;MSR Cambridge", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "Switzerland;Austria;United Kingdom" }, { "title": "Interpretable Neural Networks with Frank-Wolfe: Sparse Relevance Maps and Relevance Orderings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16181", "id": "16181", "proceeding": "https://proceedings.mlr.press/v162/macdonald22a.html", "poster": "/media/PosterPDFs/ICML%202022/ae3f58a127f6c1f12c4942432d1f246a.png?t=1657617889.5017173", "slides": "", "author_site": "Jan Macdonald, Mathieu Besan\u00e7on, Sebastian Pokutta", "author": "Jan Macdonald; Mathieu E. Besan\u00e7on; Sebastian Pokutta", "abstract": "We study the effects of constrained optimization formulations and Frank-Wolfe algorithms for obtaining interpretable neural network predictions. Reformulating the Rate-Distortion Explanations (RDE) method for relevance attribution as a constrained optimization problem provides precise control over the sparsity of relevance maps. This enables a novel multi-rate as well as a relevance-ordering variant of RDE that both empirically outperform standard RDE and other baseline methods in a well-established comparison test. We showcase several deterministic and stochastic variants of the Frank-Wolfe algorithm and their effectiveness for RDE.", "bibtex": "@InProceedings{pmlr-v162-macdonald22a,\n title = \t {Interpretable Neural Networks with Frank-Wolfe: Sparse Relevance Maps and Relevance Orderings},\n author = {Macdonald, Jan and Besan{\\c{c}}on, Mathieu E. and Pokutta, Sebastian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14699--14716},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/macdonald22a/macdonald22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/macdonald22a.html},\n abstract = \t {We study the effects of constrained optimization formulations and Frank-Wolfe algorithms for obtaining interpretable neural network predictions. 
Reformulating the Rate-Distortion Explanations (RDE) method for relevance attribution as a constrained optimization problem provides precise control over the sparsity of relevance maps. This enables a novel multi-rate as well as a relevance-ordering variant of RDE that both empirically outperform standard RDE and other baseline methods in a well-established comparison test. We showcase several deterministic and stochastic variants of the Frank-Wolfe algorithm and their effectiveness for RDE.}\n}", "pdf": "https://proceedings.mlr.press/v162/macdonald22a/macdonald22a.pdf", "supp": "", "pdf_size": 9779650, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1124674536822867580&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institut f\u00fcr Mathematik, Technische Universit\u00e4t Berlin, Germany+Department for AI in Society, Science, and Technology, Zuse Institute Berlin, Germany; Department for AI in Society, Science, and Technology, Zuse Institute Berlin, Germany; Institut f\u00fcr Mathematik, Technische Universit\u00e4t Berlin, Germany+Department for AI in Society, Science, and Technology, Zuse Institute Berlin, Germany", "aff_domain": "math.tu-berlin.de; ; ", "email": "math.tu-berlin.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/macdonald22a.html", "aff_unique_index": "0+1;1;0+1", "aff_unique_norm": "Technische Universit\u00e4t Berlin;Zuse Institute Berlin", "aff_unique_dep": "Institut f\u00fcr Mathematik;Department for AI in Society, Science, and Technology", "aff_unique_url": "https://www.tu-berlin.de;https://www.zib.de", "aff_unique_abbr": "TU Berlin;ZIB", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "Germany" }, { "title": "Interpretable Off-Policy Learning via Hyperbox Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17689", "id": "17689", "proceeding": "https://proceedings.mlr.press/v162/tschernutter22a.html", "poster": "/media/PosterPDFs/ICML%202022/6213a8959a9a96589ca484dfd1e25053_bwmzElg.png?t=1657199086.8508646", "slides": "", "author_site": "Daniel Tschernutter, Tobias Hatt, Stefan Feuerriegel", "author": "Daniel Tschernutter; Tobias Hatt; Stefan Feuerriegel", "abstract": "Personalized treatment decisions have become an integral part of modern medicine. Thereby, the aim is to make treatment decisions based on individual patient characteristics. Numerous methods have been developed for learning such policies from observational data that achieve the best outcome across a certain policy class. Yet these methods are rarely interpretable. However, interpretability is often a prerequisite for policy learning in clinical practice. In this paper, we propose an algorithm for interpretable off-policy learning via hyperbox search. In particular, our policies can be represented in disjunctive normal form (i.e., OR-of-ANDs) and are thus intelligible. We prove a universal approximation theorem that shows that our policy class is flexible enough to approximate any measurable function arbitrarily well. For optimization, we develop a tailored column generation procedure within a branch-and-bound framework. Using a simulation study, we demonstrate that our algorithm outperforms state-of-the-art methods from interpretable off-policy learning in terms of regret. 
Using real-world clinical data, we perform a user study with actual clinical experts, who rate our policies as highly interpretable.", "bibtex": "@InProceedings{pmlr-v162-tschernutter22a,\n title = \t {Interpretable Off-Policy Learning via Hyperbox Search},\n author = {Tschernutter, Daniel and Hatt, Tobias and Feuerriegel, Stefan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21795--21827},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tschernutter22a/tschernutter22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tschernutter22a.html},\n abstract = \t {Personalized treatment decisions have become an integral part of modern medicine. Thereby, the aim is to make treatment decisions based on individual patient characteristics. Numerous methods have been developed for learning such policies from observational data that achieve the best outcome across a certain policy class. Yet these methods are rarely interpretable. However, interpretability is often a prerequisite for policy learning in clinical practice. In this paper, we propose an algorithm for interpretable off-policy learning via hyperbox search. In particular, our policies can be represented in disjunctive normal form (i.e., OR-of-ANDs) and are thus intelligible. We prove a universal approximation theorem that shows that our policy class is flexible enough to approximate any measurable function arbitrarily well. For optimization, we develop a tailored column generation procedure within a branch-and-bound framework. Using a simulation study, we demonstrate that our algorithm outperforms state-of-the-art methods from interpretable off-policy learning in terms of regret. 
Using real-world clinical data, we perform a user study with actual clinical experts, who rate our policies as highly interpretable.}\n}", "pdf": "https://proceedings.mlr.press/v162/tschernutter22a/tschernutter22a.pdf", "supp": "", "pdf_size": 4081264, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=808630799671735752&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "ETH Zurich, Switzerland; ETH Zurich, Switzerland; ETH Zurich, Switzerland+LMU, Germany", "aff_domain": "ethz.ch;ethz.ch; ", "email": "ethz.ch;ethz.ch; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tschernutter22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "ETH Zurich;Ludwig Maximilian University of Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.lmu.de", "aff_unique_abbr": "ETHZ;LMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Interpretable and Generalizable Graph Learning via Stochastic Attention Mechanism", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17429", "id": "17429", "proceeding": "https://proceedings.mlr.press/v162/miao22a.html", "poster": "/media/PosterPDFs/ICML%202022/a8acc28734d4fe90ea24353d901ae678.png?t=1658117701.7955306", "slides": "", "author_site": "Siqi Miao, Mia Liu, Pan Li", "author": "Siqi Miao; Mia Liu; Pan Li", "abstract": "Interpretable graph learning is in need as many scientific applications depend on learning models to collect insights from graph-structured data. Previous works mostly focused on using post-hoc approaches to interpret pre-trained models (graph neural networks in particular). They argue against inherently interpretable models because the good interpretability of these models is often at the cost of their prediction accuracy. However, those post-hoc methods often fail to provide stable interpretation and may extract features that are spuriously correlated with the task. In this work, we address these issues by proposing Graph Stochastic Attention (GSAT). Derived from the information bottleneck principle, GSAT injects stochasticity to the attention weights to block the information from task-irrelevant graph components while learning stochasticity-reduced attention to select task-relevant subgraphs for interpretation. The selected subgraphs provably do not contain patterns that are spuriously correlated with the task under some assumptions. Extensive experiments on eight datasets show that GSAT outperforms the state-of-the-art methods by up to 20% in interpretation AUC and 5% in prediction accuracy. 
Our code is available at https://github.com/Graph-COM/GSAT.", "bibtex": "@InProceedings{pmlr-v162-miao22a,\n title = \t {Interpretable and Generalizable Graph Learning via Stochastic Attention Mechanism},\n author = {Miao, Siqi and Liu, Mia and Li, Pan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15524--15543},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/miao22a/miao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/miao22a.html},\n abstract = \t {Interpretable graph learning is in need as many scientific applications depend on learning models to collect insights from graph-structured data. Previous works mostly focused on using post-hoc approaches to interpret pre-trained models (graph neural networks in particular). They argue against inherently interpretable models because the good interpretability of these models is often at the cost of their prediction accuracy. However, those post-hoc methods often fail to provide stable interpretation and may extract features that are spuriously correlated with the task. In this work, we address these issues by proposing Graph Stochastic Attention (GSAT). Derived from the information bottleneck principle, GSAT injects stochasticity to the attention weights to block the information from task-irrelevant graph components while learning stochasticity-reduced attention to select task-relevant subgraphs for interpretation. The selected subgraphs provably do not contain patterns that are spuriously correlated with the task under some assumptions. Extensive experiments on eight datasets show that GSAT outperforms the state-of-the-art methods by up to 20% in interpretation AUC and 5% in prediction accuracy. 
Our code is available at https://github.com/Graph-COM/GSAT.}\n}", "pdf": "https://proceedings.mlr.press/v162/miao22a/miao22a.pdf", "supp": "", "pdf_size": 1848728, "gs_citation": 277, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15869188404391034141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Purdue University, West Lafayette, USA+Department of Physics and Astronomy, Purdue University, West Lafayette, USA; Department of Physics and Astronomy, Purdue University, West Lafayette, USA; Department of Computer Science, Purdue University, West Lafayette, USA", "aff_domain": "purdue.edu; ;purdue.edu", "email": "purdue.edu; ;purdue.edu", "github": "https://github.com/Graph-COM/GSAT", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/miao22a.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "West Lafayette", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Interventional Contrastive Learning with Meta Semantic Regularizer", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17865", "id": "17865", "proceeding": "https://proceedings.mlr.press/v162/qiang22a.html", "poster": "/media/PosterPDFs/ICML%202022/5cf21ce30208cfffaa832c6e44bb567d.png?t=1656434768.8121533", "slides": "/media/icml-2022/Slides/17865.pdf", "author_site": "Wenwen Qiang, Jiangmeng Li, Changwen Zheng, Bing Su, Hui Xiong", "author": "Wenwen Qiang; Jiangmeng Li; Changwen Zheng; Bing Su; Hui Xiong", "abstract": "Contrastive learning (CL)-based self-supervised learning models learn visual representations in a pairwise manner. Although the prevailing CL model has achieved great progress, in this paper, we uncover an ever-overlooked phenomenon: When the CL model is trained with full images, the performance tested in full images is better than that in foreground areas; when the CL model is trained with foreground areas, the performance tested in full images is worse than that in foreground areas. This observation reveals that backgrounds in images may interfere with the model learning semantic information and their influence has not been fully eliminated. To tackle this issue, we build a Structural Causal Model (SCM) to model the background as a confounder. We propose a backdoor adjustment-based regularization method, namely", "bibtex": "@InProceedings{pmlr-v162-qiang22a,\n title = \t {Interventional Contrastive Learning with Meta Semantic Regularizer},\n author = {Qiang, Wenwen and Li, Jiangmeng and Zheng, Changwen and Su, Bing and Xiong, Hui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18018--18030},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiang22a/qiang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiang22a.html},\n abstract = \t {Contrastive learning (CL)-based self-supervised learning models learn visual representations in a pairwise manner. 
Although the prevailing CL model has achieved great progress, in this paper, we uncover an ever-overlooked phenomenon: When the CL model is trained with full images, the performance tested in full images is better than that in foreground areas; when the CL model is trained with foreground areas, the performance tested in full images is worse than that in foreground areas. This observation reveals that backgrounds in images may interfere with the model learning semantic information and their influence has not been fully eliminated. To tackle this issue, we build a Structural Causal Model (SCM) to model the background as a confounder. We propose a backdoor adjustment-based regularization method, namely", "pdf": "https://proceedings.mlr.press/v162/qiang22a/qiang22a.pdf", "supp": "", "pdf_size": 1042232, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5879798822364157543&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+University of Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+University of Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China; Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China+Department of Computer Science Engineering, The Hong Kong University of Science and Technology, Hong Kong SAR, China", "aff_domain": "example.com;example.com;example.com;gmail.com;example.com", "email": "example.com;example.com;example.com;gmail.com;example.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qiang22a.html", "aff_unique_index": "0+1+2;0+1+2;0+2;3+4;5+5", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Southern Marine Science and Engineering Guangdong Laboratory;Renmin University of China;Beijing Key Laboratory of Big Data Management and Analysis Methods;Hong Kong University of Science and Technology", "aff_unique_dep": "Institute of Software;;Marine Science and Engineering;Gaoling School of Artificial Intelligence;Big Data Management and Analysis;Thrust of Artificial Intelligence", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;;http://www.ruc.edu.cn;;https://www.ust.hk", "aff_unique_abbr": "CAS;UCAS;;RUC;;HKUST", "aff_campus_unique_index": "0+0+1;0+0+1;0+1;0+0;1+2", "aff_campus_unique": "Beijing;Guangzhou;Hong Kong", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0+0;0+0", "aff_country_unique": "China" }, { "title": "Intriguing Properties of Input-Dependent Randomized Smoothing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17745", "id": "17745", "proceeding": "https://proceedings.mlr.press/v162/sukeni-k22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/a29d1598024f9e87beab4b98411d48ce.png?t=1658082529.4291174", "slides": "", "author_site": "Peter S\u00faken\u00edk, Aleksei Kuvshinov, Stephan G\u00fcnnemann", "author": "Peter S\u00faken\u0131\u0301k; Aleksei Kuvshinov; Stephan G\u00fcnnemann", "abstract": "Randomized smoothing is currently considered the state-of-the-art method to obtain certifiably robust classifiers. Despite its remarkable performance, the method is associated with various serious problems such as \u201ccertified accuracy waterfalls\u201d, certification vs. accuracy trade-off, or even fairness issues. Input-dependent smoothing approaches have been proposed with intention of overcoming these flaws. However, we demonstrate that these methods lack formal guarantees and so the resulting certificates are not justified. We show that in general, the input-dependent smoothing suffers from the curse of dimensionality, forcing the variance function to have low semi-elasticity. On the other hand, we provide a theoretical and practical framework that enables the usage of input-dependent smoothing even in the presence of the curse of dimensionality, under strict restrictions. We present one concrete design of the smoothing variance function and test it on CIFAR10 and MNIST. Our design mitigates some of the problems of classical smoothing and is formally underlined, yet further improvement of the design is still necessary.", "bibtex": "@InProceedings{pmlr-v162-sukeni-k22a,\n title = \t {Intriguing Properties of Input-Dependent Randomized Smoothing},\n author = {S{\\'u}ken\\'{\\i}k, Peter and Kuvshinov, Aleksei and G{\\\"u}nnemann, Stephan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20697--20743},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sukeni-k22a/sukeni-k22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sukeni-k22a.html},\n abstract = \t {Randomized smoothing is currently considered the state-of-the-art method to obtain certifiably robust classifiers. Despite its remarkable performance, the method is associated with various serious problems such as \u201ccertified accuracy waterfalls\u201d, certification vs. accuracy trade-off, or even fairness issues. Input-dependent smoothing approaches have been proposed with intention of overcoming these flaws. However, we demonstrate that these methods lack formal guarantees and so the resulting certificates are not justified. We show that in general, the input-dependent smoothing suffers from the curse of dimensionality, forcing the variance function to have low semi-elasticity. On the other hand, we provide a theoretical and practical framework that enables the usage of input-dependent smoothing even in the presence of the curse of dimensionality, under strict restrictions. We present one concrete design of the smoothing variance function and test it on CIFAR10 and MNIST. 
Our design mitigates some of the problems of classical smoothing and is formally underlined, yet further improvement of the design is still necessary.}\n}", "pdf": "https://proceedings.mlr.press/v162/sukeni-k22a/sukeni-k22a.pdf", "supp": "", "pdf_size": 8470811, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6155870339923015137&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institute of Science and Technology Austria, Klosterneuburg, Austria; Technical University of Munich, School of Computation, Information and Technology, Munich, Germany; Munich Data Science Institute, Munich, Germany", "aff_domain": "ista.ac.at; ; ", "email": "ista.ac.at; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sukeni-k22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Institute of Science and Technology Austria;Technical University of Munich;Munich Data Science Institute", "aff_unique_dep": ";School of Computation, Information and Technology;", "aff_unique_url": "https://www.ist.ac.at;https://www.tum.de;", "aff_unique_abbr": "IST Austria;TUM;", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Klosterneuburg;Munich", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Austria;Germany" }, { "title": "Invariant Ancestry Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16247", "id": "16247", "proceeding": "https://proceedings.mlr.press/v162/mogensen22a.html", "poster": "/media/PosterPDFs/ICML%202022/569ff987c643b4bedf504efda8f786c2.png?t=1657798743.3747752", "slides": "", "author_site": "Phillip Bredahl Mogensen, Nikolaj Thams, Jonas Peters", "author": "Phillip B Mogensen; Nikolaj Thams; Jonas Peters", "abstract": "Recently, methods have been proposed that exploit the invariance of prediction models with respect to changing environments to infer subsets of the causal parents of a response variable. If the environments influence only few of the underlying mechanisms, the subset identified by invariant causal prediction (ICP), for example, may be small, or even empty. We introduce the concept of minimal invariance and propose invariant ancestry search (IAS). In its population version, IAS outputs a set which contains only ancestors of the response and is a superset of the output of ICP. When applied to data, corresponding guarantees hold asymptotically if the underlying test for invariance has asymptotic level and power. We develop scalable algorithms and perform experiments on simulated and real data.", "bibtex": "@InProceedings{pmlr-v162-mogensen22a,\n title = \t {Invariant Ancestry Search},\n author = {Mogensen, Phillip B and Thams, Nikolaj and Peters, Jonas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15832--15857},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mogensen22a/mogensen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mogensen22a.html},\n abstract = \t {Recently, methods have been proposed that exploit the invariance of prediction models with respect to changing environments to infer subsets of the causal parents of a response variable. 
If the environments influence only few of the underlying mechanisms, the subset identified by invariant causal prediction (ICP), for example, may be small, or even empty. We introduce the concept of minimal invariance and propose invariant ancestry search (IAS). In its population version, IAS outputs a set which contains only ancestors of the response and is a superset of the output of ICP. When applied to data, corresponding guarantees hold asymptotically if the underlying test for invariance has asymptotic level and power. We develop scalable algorithms and perform experiments on simulated and real data.}\n}", "pdf": "https://proceedings.mlr.press/v162/mogensen22a/mogensen22a.pdf", "supp": "", "pdf_size": 1301875, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7085135570627495556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Mathematical Sciences, University of Copenhagen, Denmark; Department of Mathematical Sciences, University of Copenhagen, Denmark", "aff_domain": "math.ku.dk; ;math.ku.dk", "email": "math.ku.dk; ;math.ku.dk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mogensen22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Copenhagen", "aff_unique_dep": "Department of Mathematical Sciences", "aff_unique_url": "https://www.ku.dk", "aff_unique_abbr": "UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "title": "Inverse Contextual Bandits: Learning How Behavior Evolves over Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16317", "id": "16317", "proceeding": "https://proceedings.mlr.press/v162/huyuk22a.html", "poster": "/media/PosterPDFs/ICML%202022/819e3d6c1381eac87c17617e5165f38c.png?t=1657579268.9888756", "slides": "", "author_site": "Alihan H\u00fcy\u00fck, Daniel Jarrett, Mihaela van der Schaar", "author": "Alihan H\u00fcy\u00fck; Daniel Jarrett; Mihaela van der Schaar", "abstract": "Understanding a decision-maker\u2019s priorities by observing their behavior is critical for transparency and accountability in decision processes{\u2014}such as in healthcare. Though conventional approaches to policy learning almost invariably assume stationarity in behavior, this is hardly true in practice: Medical practice is constantly evolving as clinical professionals fine-tune their knowledge over time. For instance, as the medical community\u2019s understanding of organ transplantations has progressed over the years, a pertinent question is: How have actual organ allocation policies been evolving? To give an answer, we desire a policy learning method that provides interpretable representations of decision-making, in particular capturing an agent\u2019s non-stationary knowledge of the world, as well as operating in an offline manner. First, we model the evolving behavior of decision-makers in terms of contextual bandits, and formalize the problem of Inverse Contextual Bandits (\"ICB\"). Second, we propose two concrete algorithms as solutions, learning parametric and non-parametric representations of an agent\u2019s behavior. 
Finally, using both real and simulated data for liver transplantations, we illustrate the applicability and explainability of our method, as well as benchmarking and validating the accuracy of our algorithms.", "bibtex": "@InProceedings{pmlr-v162-huyuk22a,\n title = \t {Inverse Contextual Bandits: Learning How Behavior Evolves over Time},\n author = {H{\\\"u}y{\\\"u}k, Alihan and Jarrett, Daniel and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9506--9524},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huyuk22a/huyuk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/huyuk22a.html},\n abstract = \t {Understanding a decision-maker\u2019s priorities by observing their behavior is critical for transparency and accountability in decision processes{\u2014}such as in healthcare. Though conventional approaches to policy learning almost invariably assume stationarity in behavior, this is hardly true in practice: Medical practice is constantly evolving as clinical professionals fine-tune their knowledge over time. For instance, as the medical community\u2019s understanding of organ transplantations has progressed over the years, a pertinent question is: How have actual organ allocation policies been evolving? To give an answer, we desire a policy learning method that provides interpretable representations of decision-making, in particular capturing an agent\u2019s non-stationary knowledge of the world, as well as operating in an offline manner. First, we model the evolving behavior of decision-makers in terms of contextual bandits, and formalize the problem of Inverse Contextual Bandits (\"ICB\"). Second, we propose two concrete algorithms as solutions, learning parametric and non-parametric representations of an agent\u2019s behavior. 
Finally, using both real and simulated data for liver transplantations, we illustrate the applicability and explainability of our method, as well as benchmarking and validating the accuracy of our algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/huyuk22a/huyuk22a.pdf", "supp": "", "pdf_size": 2381309, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14503123552543798761&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK + Department of Electrical Engineering, University of California, Los Angeles, USA", "aff_domain": "cam.ac.uk; ; ", "email": "cam.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/huyuk22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "University of Cambridge;University of California, Los Angeles", "aff_unique_dep": "Department of Applied Mathematics and Theoretical Physics;Department of Electrical Engineering", "aff_unique_url": "https://www.cam.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;UCLA", "aff_campus_unique_index": "0;0;0+1", "aff_campus_unique": "Cambridge;Los Angeles", "aff_country_unique_index": "0;0;0+1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Investigating Generalization by Controlling Normalized Margin", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17115", "id": "17115", "proceeding": "https://proceedings.mlr.press/v162/farhang22a.html", "poster": "/media/PosterPDFs/ICML%202022/201d546992726352471cfea6b0df0a48.png?t=1658078379.975189", "slides": "", "author_site": "Alexander Farhang, Jeremy Bernstein, Kushal Tirumala, Yang Liu, Yisong Yue", "author": "Alexander R Farhang; Jeremy D Bernstein; Kushal Tirumala; Yang Liu; Yisong Yue", "abstract": "Weight norm $\\|w\\|$ and margin $\\gamma$ participate in learning theory via the normalized margin $\\gamma/\\|w\\|$. Since standard neural net optimizers do not control normalized margin, it is hard to test whether this quantity causally relates to generalization. This paper designs a series of experimental studies that explicitly control normalized margin and thereby tackle two central questions. First: does normalized margin always have a causal effect on generalization? The paper finds that no\u2014networks can be produced where normalized margin has seemingly no relationship with generalization, counter to the theory of Bartlett et al. (2017). Second: does normalized margin ever have a causal effect on generalization? The paper finds that yes\u2014in a standard training setup, test performance closely tracks normalized margin. 
The paper suggests a Gaussian process model as a promising explanation for this behavior.", "bibtex": "@InProceedings{pmlr-v162-farhang22a,\n title = \t {Investigating Generalization by Controlling Normalized Margin},\n author = {Farhang, Alexander R and Bernstein, Jeremy D and Tirumala, Kushal and Liu, Yang and Yue, Yisong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6324--6336},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/farhang22a/farhang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/farhang22a.html},\n abstract = \t {Weight norm $\\|w\\|$ and margin $\\gamma$ participate in learning theory via the normalized margin $\\gamma/\\|w\\|$. Since standard neural net optimizers do not control normalized margin, it is hard to test whether this quantity causally relates to generalization. This paper designs a series of experimental studies that explicitly control normalized margin and thereby tackle two central questions. First: does normalized margin always have a causal effect on generalization? The paper finds that no\u2014networks can be produced where normalized margin has seemingly no relationship with generalization, counter to the theory of Bartlett et al. (2017). Second: does normalized margin ever have a causal effect on generalization? The paper finds that yes\u2014in a standard training setup, test performance closely tracks normalized margin. The paper suggests a Gaussian process model as a promising explanation for this behavior.}\n}", "pdf": "https://proceedings.mlr.press/v162/farhang22a/farhang22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/farhang22a-supp.zip", "pdf_size": 462032, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=715638377527231014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Caltech; Caltech; Caltech; Argo AI; Caltech+Argo AI", "aff_domain": "caltech.edu;caltech.edu; ;argoai.com;caltech.edu", "email": "caltech.edu;caltech.edu; ;argoai.com;caltech.edu", "github": "https://github.com/alexfarhang/margin", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/farhang22a.html", "aff_unique_index": "0;0;0;1;0+1", "aff_unique_norm": "California Institute of Technology;Argo AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.caltech.edu;https://www.argo.ai", "aff_unique_abbr": "Caltech;Argo AI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Investigating Why Contrastive Learning Benefits Robustness against Label Noise", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17389", "id": "17389", "proceeding": "https://proceedings.mlr.press/v162/xue22a.html", "poster": "/media/PosterPDFs/ICML%202022/5857d68cd9280bc98d079fa912fd6740.png?t=1657171591.7605698", "slides": "", "author_site": "Yihao Xue, Kyle Whitecross, Baharan Mirzasoleiman", "author": "Yihao Xue; Kyle Whitecross; Baharan Mirzasoleiman", "abstract": "Self-supervised Contrastive Learning (CL) has been recently shown to be very effective in preventing deep networks from overfitting noisy labels. 
Despite its empirical success, the theoretical understanding of the effect of contrastive learning on boosting robustness is very limited. In this work, we rigorously prove that the representation matrix learned by contrastive learning boosts robustness, by having: (i) one prominent singular value corresponding to each sub-class in the data, and significantly smaller remaining singular values; and (ii) a large alignment between the prominent singular vectors and the clean labels of each sub-class. The above properties enable a linear layer trained on such representations to effectively learn the clean labels without overfitting the noise. We further show that the low-rank structure of the Jacobian of deep networks pre-trained with contrastive learning allows them to achieve a superior performance initially, when fine-tuned on noisy labels. Finally, we demonstrate that the initial robustness provided by contrastive learning enables robust training methods to achieve state-of-the-art performance under extreme noise levels, e.g., an average of 27.18% and 15.58% increase in accuracy on CIFAR-10 and CIFAR-100 with 80% symmetric noisy labels, and 4.11% increase in accuracy on WebVision.", "bibtex": "@InProceedings{pmlr-v162-xue22a,\n title = \t {Investigating Why Contrastive Learning Benefits Robustness against Label Noise},\n author = {Xue, Yihao and Whitecross, Kyle and Mirzasoleiman, Baharan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24851--24871},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xue22a/xue22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xue22a.html},\n abstract = \t {Self-supervised Contrastive Learning (CL) has been recently shown to be very effective in preventing deep networks from overfitting noisy labels. Despite its empirical success, the theoretical understanding of the effect of contrastive learning on boosting robustness is very limited. In this work, we rigorously prove that the representation matrix learned by contrastive learning boosts robustness, by having: (i) one prominent singular value corresponding to each sub-class in the data, and significantly smaller remaining singular values; and (ii) a large alignment between the prominent singular vectors and the clean labels of each sub-class. The above properties enable a linear layer trained on such representations to effectively learn the clean labels without overfitting the noise. We further show that the low-rank structure of the Jacobian of deep networks pre-trained with contrastive learning allows them to achieve a superior performance initially, when fine-tuned on noisy labels. 
Finally, we demonstrate that the initial robustness provided by contrastive learning enables robust training methods to achieve state-of-the-art performance under extreme noise levels, e.g., an average of 27.18% and 15.58% increase in accuracy on CIFAR-10 and CIFAR-100 with 80% symmetric noisy labels, and 4.11% increase in accuracy on WebVision.}\n}", "pdf": "https://proceedings.mlr.press/v162/xue22a/xue22a.pdf", "supp": "", "pdf_size": 956812, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18334619009590852442&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA", "aff_domain": "g.ucla.edu; ; ", "email": "g.ucla.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/xue22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Iterative Double Sketching for Faster Least-Squares Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17925", "id": "17925", "proceeding": "https://proceedings.mlr.press/v162/wang22t.html", "poster": "/media/PosterPDFs/ICML%202022/f60f6b0d129342bb6a226305aaf842b7.png?t=1658199274.6014867", "slides": "", "author_site": "Rui Wang, Yanyan Ouyang, Wangli Xu", "author": "Rui Wang; Yanyan Ouyang; Wangli Xu", "abstract": "This work is concerned with the overdetermined linear least-squares problem for large scale data. We generalize the iterative Hessian sketching (IHS) algorithm and propose a new sketching framework named iterative double sketching (IDS) which uses approximations for both the gradient and the Hessian in each iteration. To understand the behavior of the IDS algorithm and choose the optimal hyperparameters, we derive the exact limit of the conditional prediction error of the IDS algorithm in the setting of Gaussian sketching. Guided by this theoretical result, we propose an efficient IDS algorithm via a new class of sequentially related sketching matrices. We give a non-asymptotic analysis of this efficient IDS algorithm which shows that the proposed algorithm achieves the state-of-the-art trade-off between accuracy and efficiency.", "bibtex": "@InProceedings{pmlr-v162-wang22t,\n title = \t {Iterative Double Sketching for Faster Least-Squares Optimization},\n author = {Wang, Rui and Ouyang, Yanyan and Xu, Wangli},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22935--22963},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22t/wang22t.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22t.html},\n abstract = \t {This work is concerned with the overdetermined linear least-squares problem for large scale data. 
We generalize the iterative Hessian sketching (IHS) algorithm and propose a new sketching framework named iterative double sketching (IDS) which uses approximations for both the gradient and the Hessian in each iteration. To understand the behavior of the IDS algorithm and choose the optimal hyperparameters, we derive the exact limit of the conditional prediction error of the IDS algorithm in the setting of Gaussian sketching. Guided by this theoretical result, we propose an efficient IDS algorithm via a new class of sequentially related sketching matrices. We give a non-asymptotic analysis of this efficient IDS algorithm which shows that the proposed algorithm achieves the state-of-the-art trade-off between accuracy and efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22t/wang22t.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22t-supp.zip", "pdf_size": 570130, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17305999811460455038&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Center for Applied Statistics and School of Statistics, Renmin University of China, Beijing 100872, China; Center for Applied Statistics and School of Statistics, Renmin University of China, Beijing 100872, China; Center for Applied Statistics and School of Statistics, Renmin University of China, Beijing 100872, China", "aff_domain": "ruc.edu.cn; ; ", "email": "ruc.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22t.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Renmin University of China", "aff_unique_dep": "Center for Applied Statistics, School of Statistics", "aff_unique_url": "http://www.ruc.edu.cn", "aff_unique_abbr": "RUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Iterative Hard Thresholding with Adaptive Regularization: Sparser Solutions Without Sacrificing Runtime", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17705", "id": "17705", "proceeding": "https://proceedings.mlr.press/v162/axiotis22a.html", "poster": "", "slides": "", "author_site": "Kyriakos Axiotis \u00b7", "author": "Kyriakos Axiotis; Maxim Sviridenko", "abstract": "We propose a simple modification to the iterative hard thresholding (IHT) algorithm, which recovers asymptotically sparser solutions as a function of the condition number. When aiming to minimize a convex function f(x) with condition number $\\kappa$ subject to x being an s-sparse vector, the standard IHT guarantee is a solution with relaxed sparsity $O(s\\kappa^2)$, while our proposed algorithm, regularized IHT, returns a solution with sparsity $O(s\\kappa)$. Our algorithm significantly improves over ARHT [Axiotis & Sviridenko, 2021] which also achieves $O(s\\kappa)$, as it does not require re-optimization in each iteration (and so is much faster), is deterministic, and does not require knowledge of the optimal solution value f(x*) or the optimal sparsity level s. Our main technical tool is an adaptive regularization framework, in which the algorithm progressively learns the weights of an l_2 regularization term that will allow convergence to sparser solutions. 
We also apply this framework to low rank optimization, where we achieve a similar improvement of the best known condition number dependence from $\\kappa^2$ to $\\kappa$.", "bibtex": "@InProceedings{pmlr-v162-axiotis22a,\n title = \t {Iterative Hard Thresholding with Adaptive Regularization: Sparser Solutions Without Sacrificing Runtime},\n author = {Axiotis, Kyriakos and Sviridenko, Maxim},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1175--1197},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/axiotis22a/axiotis22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/axiotis22a.html},\n abstract = \t {We propose a simple modification to the iterative hard thresholding (IHT) algorithm, which recovers asymptotically sparser solutions as a function of the condition number. When aiming to minimize a convex function f(x) with condition number $\\kappa$ subject to x being an s-sparse vector, the standard IHT guarantee is a solution with relaxed sparsity $O(s\\kappa^2)$, while our proposed algorithm, regularized IHT, returns a solution with sparsity $O(s\\kappa)$. Our algorithm significantly improves over ARHT [Axiotis & Sviridenko, 2021] which also achieves $O(s\\kappa)$, as it does not require re-optimization in each iteration (and so is much faster), is deterministic, and does not require knowledge of the optimal solution value f(x*) or the optimal sparsity level s. Our main technical tool is an adaptive regularization framework, in which the algorithm progressively learns the weights of an l_2 regularization term that will allow convergence to sparser solutions. We also apply this framework to low rank optimization, where we achieve a similar improvement of the best known condition number dependence from $\\kappa^2$ to $\\kappa$.}\n}", "pdf": "https://proceedings.mlr.press/v162/axiotis22a/axiotis22a.pdf", "supp": "", "pdf_size": 580919, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15135997969936454097&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "MIT; Yahoo! Research", "aff_domain": "mit.edu;yahooinc.com", "email": "mit.edu;yahooinc.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/axiotis22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Yahoo!", "aff_unique_dep": ";Yahoo! Research", "aff_unique_url": "https://web.mit.edu;https://research.yahoo.com", "aff_unique_abbr": "MIT;Yahoo!", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "It\u2019s Raw! Audio Generation with State-Space Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17773", "id": "17773", "proceeding": "https://proceedings.mlr.press/v162/goel22a.html", "poster": "", "slides": "", "author_site": "Karan Goel, Albert Gu, Chris Donahue, Christopher Re", "author": "Karan Goel; Albert Gu; Chris Donahue; Christopher Re", "abstract": "Developing architectures suitable for modeling raw audio is a challenging problem due to the high sampling rates of audio waveforms. 
Standard sequence modeling approaches like RNNs and CNNs have previously been tailored to fit the demands of audio, but the resultant architectures make undesirable computational tradeoffs and struggle to model waveforms effectively. We propose SaShiMi, a new multi-scale architecture for waveform modeling built around the recently introduced S4 model for long sequence modeling. We identify that S4 can be unstable during autoregressive generation, and provide a simple improvement to its parameterization by drawing connections to Hurwitz matrices. SaShiMi yields state-of-the-art performance for unconditional waveform generation in the autoregressive setting. Additionally, SaShiMi improves non-autoregressive generation performance when used as the backbone architecture for a diffusion model. Compared to prior architectures in the autoregressive generation setting, SaShiMi generates piano and speech waveforms which humans find more musical and coherent respectively, e.g. 2{\\texttimes} better mean opinion scores than WaveNet on an unconditional speech generation task. On a music generation task, SaShiMi outperforms WaveNet on density estimation and speed at both training and inference even when using 3{\\texttimes} fewer parameters", "bibtex": "@InProceedings{pmlr-v162-goel22a,\n title = \t {It\u2019s Raw! {A}udio Generation with State-Space Models},\n author = {Goel, Karan and Gu, Albert and Donahue, Chris and Re, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7616--7633},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/goel22a/goel22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/goel22a.html},\n abstract = \t {Developing architectures suitable for modeling raw audio is a challenging problem due to the high sampling rates of audio waveforms. Standard sequence modeling approaches like RNNs and CNNs have previously been tailored to fit the demands of audio, but the resultant architectures make undesirable computational tradeoffs and struggle to model waveforms effectively. We propose SaShiMi, a new multi-scale architecture for waveform modeling built around the recently introduced S4 model for long sequence modeling. We identify that S4 can be unstable during autoregressive generation, and provide a simple improvement to its parameterization by drawing connections to Hurwitz matrices. SaShiMi yields state-of-the-art performance for unconditional waveform generation in the autoregressive setting. Additionally, SaShiMi improves non-autoregressive generation performance when used as the backbone architecture for a diffusion model. Compared to prior architectures in the autoregressive generation setting, SaShiMi generates piano and speech waveforms which humans find more musical and coherent respectively, e.g. 2{\\texttimes} better mean opinion scores than WaveNet on an unconditional speech generation task. 
On a music generation task, SaShiMi outperforms WaveNet on density estimation and speed at both training and inference even when using 3{\\texttimes} fewer parameters}\n}", "pdf": "https://proceedings.mlr.press/v162/goel22a/goel22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/goel22a-supp.zip", "pdf_size": 3019345, "gs_citation": 256, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7324887611101847462&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu;stanford.edu; ; ", "email": "cs.stanford.edu;stanford.edu; ; ", "github": "", "project": "https://hazyresearch.stanford.edu/sashimi-examples", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/goel22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Kernel Methods for Radial Transformed Compositional Data with Many Zeros", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16213", "id": "16213", "proceeding": "https://proceedings.mlr.press/v162/park22d.html", "poster": "/media/PosterPDFs/ICML%202022/080c993fb3b58e26c1d2265bf9da0af3.png?t=1657724010.5204327", "slides": "/media/icml-2022/Slides/16213.pdf", "author_site": "Junyoung Park, Changwon Yoon, Cheolwoo Park, Jeongyoun Ahn", "author": "Junyoung Park; Changwon Yoon; Cheolwoo Park; Jeongyoun Ahn", "abstract": "Compositional data analysis with a high proportion of zeros has gained increasing popularity, especially in chemometrics and human gut microbiomes research. Statistical analyses of this type of data are typically carried out via a log-ratio transformation after replacing zeros with small positive values. We should note, however, that this procedure is geometrically improper, as it causes anomalous distortions through the transformation. We propose a radial transformation that does not require zero substitutions and more importantly results in essential equivalence between domains before and after the transformation. We show that a rich class of kernels on hyperspheres can successfully define a kernel embedding for compositional data based on this equivalence. To the best of our knowledge, this is the first work that theoretically establishes the availability of the extensive library of kernel-based machine learning methods for compositional data. 
The applicability of the proposed approach is demonstrated with kernel principal component analysis.", "bibtex": "@InProceedings{pmlr-v162-park22d,\n title = \t {Kernel Methods for Radial Transformed Compositional Data with Many Zeros},\n author = {Park, Junyoung and Yoon, Changwon and Park, Cheolwoo and Ahn, Jeongyoun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17458--17472},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/park22d/park22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/park22d.html},\n abstract = \t {Compositional data analysis with a high proportion of zeros has gained increasing popularity, especially in chemometrics and human gut microbiomes research. Statistical analyses of this type of data are typically carried out via a log-ratio transformation after replacing zeros with small positive values. We should note, however, that this procedure is geometrically improper, as it causes anomalous distortions through the transformation. We propose a radial transformation that does not require zero substitutions and more importantly results in essential equivalence between domains before and after the transformation. We show that a rich class of kernels on hyperspheres can successfully define a kernel embedding for compositional data based on this equivalence. To the best of our knowledge, this is the first work that theoretically establishes the availability of the extensive library of kernel-based machine learning methods for compositional data. 
The applicability of the proposed approach is demonstrated with kernel principal component analysis.}\n}", "pdf": "https://proceedings.mlr.press/v162/park22d/park22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/park22d-supp.zip", "pdf_size": 2146208, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10968612283529907506&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Department of Mathematical Sciences, KAIST, Daejeon, Korea; Department of Industrial & Systems Engineering, KAIST, Daejeon, Korea; Department of Mathematical Sciences, KAIST, Daejeon, Korea; Department of Industrial & Systems Engineering, KAIST, Daejeon, Korea", "aff_domain": "kaist.ac.kr; ;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr; ;kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/park22d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "Department of Mathematical Sciences", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Kernelized Multiplicative Weights for 0/1-Polyhedral Games: Bridging the Gap Between Learning in Extensive-Form and Normal-Form Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17179", "id": "17179", "proceeding": "https://proceedings.mlr.press/v162/farina22a.html", "poster": "", "slides": "", "author_site": "Gabriele Farina, Chung-Wei Lee, Haipeng Luo, Christian Kroer", "author": "Gabriele Farina; Chung-Wei Lee; Haipeng Luo; Christian Kroer", "abstract": "While extensive-form games (EFGs) can be converted into normal-form games (NFGs), doing so comes at the cost of an exponential blowup of the strategy space. So, progress on NFGs and EFGs has historically followed separate tracks, with the EFG community often having to catch up with advances (\\eg last-iterate convergence and predictive regret bounds) from the larger NFG community. In this paper we show that the Optimistic Multiplicative Weights Update (OMWU) algorithm\u2014the premier learning algorithm for NFGs\u2014can be simulated on the normal-form equivalent of an EFG in linear time per iteration in the game tree size using a kernel trick. The resulting algorithm,", "bibtex": "@InProceedings{pmlr-v162-farina22a,\n title = \t {Kernelized Multiplicative Weights for 0/1-Polyhedral Games: Bridging the Gap Between Learning in Extensive-Form and Normal-Form Games},\n author = {Farina, Gabriele and Lee, Chung-Wei and Luo, Haipeng and Kroer, Christian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6337--6357},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/farina22a/farina22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/farina22a.html},\n abstract = \t {While extensive-form games (EFGs) can be converted into normal-form games (NFGs), doing so comes at the cost of an exponential blowup of the strategy space. 
So, progress on NFGs and EFGs has historically followed separate tracks, with the EFG community often having to catch up with advances (\\eg last-iterate convergence and predictive regret bounds) from the larger NFG community. In this paper we show that the Optimistic Multiplicative Weights Update (OMWU) algorithm\u2014the premier learning algorithm for NFGs\u2014can be simulated on the normal-form equivalent of an EFG in linear time per iteration in the game tree size using a kernel trick. The resulting algorithm,", "pdf": "https://proceedings.mlr.press/v162/farina22a/farina22a.pdf", "supp": "", "pdf_size": 755669, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1695762167595620924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Computer Science Department, Carnegie Mellon University; Computer Science Department, University of Southern California; Computer Science Department, University of Southern California; IEOR Department, Columbia University", "aff_domain": "cs.cmu.edu;usc.edu;usc.edu;columbia.edu", "email": "cs.cmu.edu;usc.edu;usc.edu;columbia.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/farina22a.html", "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Carnegie Mellon University;University of Southern California;Columbia University", "aff_unique_dep": "Computer Science Department;Computer Science Department;IEOR Department", "aff_unique_url": "https://www.cmu.edu;https://www.usc.edu;https://www.columbia.edu", "aff_unique_abbr": "CMU;USC;Columbia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Kill a Bird with Two Stones: Closing the Convergence Gaps in Non-Strongly Convex Optimization by Directly Accelerated SVRG with Double Compensation and Snapshots", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18383", "id": "18383", "proceeding": "https://proceedings.mlr.press/v162/liu22q.html", "poster": "/media/PosterPDFs/ICML%202022/495dabfd0ca768a3c3abd672079f48b6.png?t=1657513391.6093986", "slides": "", "author_site": "Yuanyuan Liu, Fanhua Shang, Weixin An, Hongying Liu, Zhouchen Lin", "author": "Yuanyuan Liu; Fanhua Shang; Weixin An; Hongying Liu; Zhouchen Lin", "abstract": "Recently, some accelerated stochastic variance reduction algorithms such as Katyusha and ASVRG-ADMM achieve faster convergence than non-accelerated methods such as SVRG and SVRG-ADMM. However, there are still some gaps between the oracle complexities and their lower bounds. To fill in these gaps, this paper proposes a novel Directly Accelerated stochastic Variance reductIon (DAVIS) algorithm with two Snapshots for non-strongly convex (non-SC) unconstrained problems. Our theoretical results show that DAVIS achieves the optimal convergence rate O(1/(nS^2)) and optimal gradient complexity O(n+\\sqrt{nL/\\epsilon}), which is identical to its lower bound. To the best of our knowledge, this is the first directly accelerated algorithm that attains the optimal lower bound and improves the convergence rate from O(1/S^2) to O(1/(nS^2)). 
Moreover, we extend DAVIS and theoretical results to non-SC problems with a structured regularizer, and prove that the proposed algorithm with double-snapshots also attains the optimal convergence rate O(1/(nS)) and optimal oracle complexity O(n+L/\\epsilon) for such problems, and it is at least a factor n/S faster than existing accelerated stochastic algorithms, where n\\gg S in general.", "bibtex": "@InProceedings{pmlr-v162-liu22q,\n title = \t {Kill a Bird with Two Stones: Closing the Convergence Gaps in Non-Strongly Convex Optimization by Directly Accelerated {SVRG} with Double Compensation and Snapshots},\n author = {Liu, Yuanyuan and Shang, Fanhua and An, Weixin and Liu, Hongying and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14008--14035},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22q/liu22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22q.html},\n abstract = \t {Recently, some accelerated stochastic variance reduction algorithms such as Katyusha and ASVRG-ADMM achieve faster convergence than non-accelerated methods such as SVRG and SVRG-ADMM. However, there are still some gaps between the oracle complexities and their lower bounds. To fill in these gaps, this paper proposes a novel Directly Accelerated stochastic Variance reductIon (DAVIS) algorithm with two Snapshots for non-strongly convex (non-SC) unconstrained problems. Our theoretical results show that DAVIS achieves the optimal convergence rate O(1/(nS^2)) and optimal gradient complexity O(n+\\sqrt{nL/\\epsilon}), which is identical to its lower bound. To the best of our knowledge, this is the first directly accelerated algorithm that attains the optimal lower bound and improves the convergence rate from O(1/S^2) to O(1/(nS^2)). Moreover, we extend DAVIS and theoretical results to non-SC problems with a structured regularizer, and prove that the proposed algorithm with double-snapshots also attains the optimal convergence rate O(1/(nS)) and optimal oracle complexity O(n+L/\\epsilon) for such problems, and it is at least a factor n/S faster than existing accelerated stochastic algorithms, where n\\gg S in general.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22q/liu22q.pdf", "supp": "", "pdf_size": 734312, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13239335065828146769&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Key Lab. of Intelligent Perception and Image Understanding of Ministry of Education, School of Artificial Intelligence, Xidian University, China; School of Computer Science and Technology, College of Intelligence and Computing, Tianjin University, Tianjin, 300350, China; Peng Cheng Laboratory; Key Lab. 
of Machine Perception (MoE), School of Artificial Intelligence, Peking University; Institute for Artificial Intelligence, Peking University", "aff_domain": "foxmail.com; ; ; ; ", "email": "foxmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22q.html", "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "Xidian University;Tianjin University;Pengcheng Laboratory;Peking University", "aff_unique_dep": "School of Artificial Intelligence;School of Computer Science and Technology;Peng Cheng Laboratory;School of Artificial Intelligence", "aff_unique_url": "http://www.xidian.edu.cn/;http://www.tju.edu.cn;http://www.pcl.ac.cn;http://www.pku.edu.cn", "aff_unique_abbr": "Xidian;Tianjin U;PCL;PKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tianjin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Knowledge Base Question Answering by Case-based Reasoning over Subgraphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18311", "id": "18311", "proceeding": "https://proceedings.mlr.press/v162/das22a.html", "poster": "/media/PosterPDFs/ICML%202022/0ad5292c158f3924f8b480367fcbeb94.png?t=1658183390.1251626", "slides": "/media/icml-2022/Slides/18311.pdf", "author_site": "Rajarshi Das, Ameya Godbole, Ankita Rajaram Naik, Elliot Tower, Manzil Zaheer, Hannaneh Hajishirzi, Robin Jia, Andrew McCallum", "author": "Rajarshi Das; Ameya Godbole; Ankita Naik; Elliot Tower; Manzil Zaheer; Hannaneh Hajishirzi; Robin Jia; Andrew Mccallum", "abstract": "Question answering (QA) over knowledge bases (KBs) is challenging because of the diverse, essentially unbounded, types of reasoning patterns needed. However, we hypothesize in a large KB, reasoning patterns required to answer a query type reoccur for various entities in their respective subgraph neighborhoods. Leveraging this structural similarity between local neighborhoods of different subgraphs, we introduce a semiparametric model (CBR-SUBG) with (i) a nonparametric component that for each query, dynamically retrieves other similar $k$-nearest neighbor (KNN) training queries along with query-specific subgraphs and (ii) a parametric component that is trained to identify the (latent) reasoning patterns from the subgraphs of KNN queries and then apply them to the subgraph of the target query. We also propose an adaptive subgraph collection strategy to select a query-specific compact subgraph, allowing us to scale to full Freebase KB containing billions of facts. We show that CBR-SUBG can answer queries requiring subgraph reasoning patterns and performs competitively with the best models on several KBQA benchmarks. Our subgraph collection strategy also produces more compact subgraphs (e.g. 
55% reduction in size for WebQSP while increasing answer recall by 4.85%)\\footnote{Code, model, and subgraphs are available at \\url{https://github.com/rajarshd/CBR-SUBG}}.", "bibtex": "@InProceedings{pmlr-v162-das22a,\n title = \t {Knowledge Base Question Answering by Case-based Reasoning over Subgraphs},\n author = {Das, Rajarshi and Godbole, Ameya and Naik, Ankita and Tower, Elliot and Zaheer, Manzil and Hajishirzi, Hannaneh and Jia, Robin and Mccallum, Andrew},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4777--4793},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/das22a/das22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/das22a.html},\n abstract = \t {Question answering (QA) over knowledge bases (KBs) is challenging because of the diverse, essentially unbounded, types of reasoning patterns needed. However, we hypothesize in a large KB, reasoning patterns required to answer a query type reoccur for various entities in their respective subgraph neighborhoods. Leveraging this structural similarity between local neighborhoods of different subgraphs, we introduce a semiparametric model (CBR-SUBG) with (i) a nonparametric component that for each query, dynamically retrieves other similar $k$-nearest neighbor (KNN) training queries along with query-specific subgraphs and (ii) a parametric component that is trained to identify the (latent) reasoning patterns from the subgraphs of KNN queries and then apply them to the subgraph of the target query. We also propose an adaptive subgraph collection strategy to select a query-specific compact subgraph, allowing us to scale to full Freebase KB containing billions of facts. We show that CBR-SUBG can answer queries requiring subgraph reasoning patterns and performs competitively with the best models on several KBQA benchmarks. Our subgraph collection strategy also produces more compact subgraphs (e.g. 
55% reduction in size for WebQSP while increasing answer recall by 4.85%)\\footnote{Code, model, and subgraphs are available at \\url{https://github.com/rajarshd/CBR-SUBG}}.}\n}", "pdf": "https://proceedings.mlr.press/v162/das22a/das22a.pdf", "supp": "", "pdf_size": 559278, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9521902592444277767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Washington; University of Southern California; UMass Amherst; Google DeepMind; University of Southern California; Google DeepMind; University of Washington; UMass Amherst", "aff_domain": "cs.washington.edu;usc.edu; ; ; ; ;cs.washington.edu; ", "email": "cs.washington.edu;usc.edu; ; ; ; ;cs.washington.edu; ", "github": "https://github.com/rajarshd/CBR-SUBG", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/das22a.html", "aff_unique_index": "0;1;2;3;1;3;0;2", "aff_unique_norm": "University of Washington;University of Southern California;University of Massachusetts Amherst;Google", "aff_unique_dep": ";;;Google DeepMind", "aff_unique_url": "https://www.washington.edu;https://www.usc.edu;https://www.umass.edu;https://deepmind.com", "aff_unique_abbr": "UW;USC;UMass Amherst;DeepMind", "aff_campus_unique_index": "1;2;1;2", "aff_campus_unique": ";Los Angeles;Amherst", "aff_country_unique_index": "0;0;0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Knowledge-Grounded Self-Rationalization via Extractive and Natural Language Explanations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18257", "id": "18257", "proceeding": "https://proceedings.mlr.press/v162/majumder22a.html", "poster": "/media/PosterPDFs/ICML%202022/b0d336331ae52d66769bb655c37032c6_ea4k7A2.png?t=1658058689.9259274", "slides": "", "author_site": "Bodhisattwa Prasad Majumder, Oana-Maria Camburu, Thomas Lukasiewicz, Julian McAuley", "author": "Bodhisattwa Prasad Majumder; Oana Camburu; Thomas Lukasiewicz; Julian Mcauley", "abstract": "Models that generate extractive rationales (i.e., subsets of features) or natural language explanations (NLEs) for their predictions are important for explainable AI. While an extractive rationale provides a quick view of the features most responsible for a prediction, an NLE allows for a comprehensive description of the decision-making process behind a prediction. However, current models that generate the best extractive rationales or NLEs often fall behind the state-of-the-art (SOTA) in terms of task performance. In this work, we bridge this gap by introducing RExC, a self-rationalizing framework that grounds its predictions and two complementary types of explanations (NLEs and extractive rationales) in background knowledge. Our framework improves over previous methods by: (i) reaching SOTA task performance while also providing explanations, (ii) providing two types of explanations, while existing models usually provide only one type, and (iii) beating by a large margin the previous SOTA in terms of quality of both types of explanations. 
Furthermore, a perturbation analysis in RExC shows a high degree of association between explanations and predictions, a necessary property of faithful explanations.", "bibtex": "@InProceedings{pmlr-v162-majumder22a,\n title = \t {Knowledge-Grounded Self-Rationalization via Extractive and Natural Language Explanations},\n author = {Majumder, Bodhisattwa Prasad and Camburu, Oana and Lukasiewicz, Thomas and Mcauley, Julian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14786--14801},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/majumder22a/majumder22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/majumder22a.html},\n abstract = \t {Models that generate extractive rationales (i.e., subsets of features) or natural language explanations (NLEs) for their predictions are important for explainable AI. While an extractive rationale provides a quick view of the features most responsible for a prediction, an NLE allows for a comprehensive description of the decision-making process behind a prediction. However, current models that generate the best extractive rationales or NLEs often fall behind the state-of-the-art (SOTA) in terms of task performance. In this work, we bridge this gap by introducing RExC, a self-rationalizing framework that grounds its predictions and two complementary types of explanations (NLEs and extractive rationales) in background knowledge. Our framework improves over previous methods by: (i) reaching SOTA task performance while also providing explanations, (ii) providing two types of explanations, while existing models usually provide only one type, and (iii) beating by a large margin the previous SOTA in terms of quality of both types of explanations. 
Furthermore, a perturbation analysis in RExC shows a high degree of association between explanations and predictions, a necessary property of faithful explanations.}\n}", "pdf": "https://proceedings.mlr.press/v162/majumder22a/majumder22a.pdf", "supp": "", "pdf_size": 12593569, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17002815851190376323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, UC San Diego, USA; Department of Computer Science, University of Oxford, UK; Institute of Logic and Computation, TU Wien, Austria; Department of Computer Science and Engineering, UC San Diego, USA", "aff_domain": "eng.ucsd.edu; ; ; ", "email": "eng.ucsd.edu; ; ; ", "github": "https://github.com/majumderb/rexc", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/majumder22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, San Diego;University of Oxford;TU Wien", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Computer Science;Institute of Logic and Computation", "aff_unique_url": "https://www.ucsd.edu;https://www.ox.ac.uk;https://www.tuwien.ac.at", "aff_unique_abbr": "UCSD;Oxford;TU Wien", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United States;United Kingdom;Austria" }, { "title": "Koopman Q-learning: Offline Reinforcement Learning via Symmetries of Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17725", "id": "17725", "proceeding": "https://proceedings.mlr.press/v162/weissenbacher22a.html", "poster": "/media/PosterPDFs/ICML%202022/6616758da438b02b8d360ad83a5b3d77_L4BuSms.png?t=1657347620.8795366", "slides": "", "author_site": "Matthias Weissenbacher, Samrath Sinha, Animesh Garg, Yoshinobu Kawahara", "author": "Matthias Weissenbacher; Samarth Sinha; Animesh Garg; Kawahara Yoshinobu", "abstract": "Offline reinforcement learning leverages large datasets to train policies without interactions with the environment. The learned policies may then be deployed in real-world settings where interactions are costly or dangerous. Current algorithms over-fit to the training dataset and as a consequence perform poorly when deployed to out-of-distribution generalizations of the environment. We aim to address these limitations by learning a Koopman latent representation which allows us to infer symmetries of the system\u2019s underlying dynamic. The latter is then utilized to extend the otherwise static offline dataset during training; this constitutes a novel data augmentation framework which reflects the system\u2019s dynamic and is thus to be interpreted as an exploration of the environments phase space. To obtain the symmetries we employ Koopman theory in which nonlinear dynamics are represented in terms of a linear operator acting on the space of measurement functions of the system. We provide novel theoretical results on the existence and nature of symmetries relevant for control systems such as reinforcement learning settings. 
Moreover, we empirically evaluate our method on several benchmark offline reinforcement learning tasks and datasets including D4RL, Metaworld and Robosuite and find that by using our framework we consistently improve the state-of-the-art of model-free Q-learning methods.", "bibtex": "@InProceedings{pmlr-v162-weissenbacher22a,\n title = \t {Koopman Q-learning: Offline Reinforcement Learning via Symmetries of Dynamics},\n author = {Weissenbacher, Matthias and Sinha, Samarth and Garg, Animesh and Yoshinobu, Kawahara},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23645--23667},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/weissenbacher22a/weissenbacher22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/weissenbacher22a.html},\n abstract = \t {Offline reinforcement learning leverages large datasets to train policies without interactions with the environment. The learned policies may then be deployed in real-world settings where interactions are costly or dangerous. Current algorithms over-fit to the training dataset and as a consequence perform poorly when deployed to out-of-distribution generalizations of the environment. We aim to address these limitations by learning a Koopman latent representation which allows us to infer symmetries of the system\u2019s underlying dynamic. The latter is then utilized to extend the otherwise static offline dataset during training; this constitutes a novel data augmentation framework which reflects the system\u2019s dynamic and is thus to be interpreted as an exploration of the environments phase space. To obtain the symmetries we employ Koopman theory in which nonlinear dynamics are represented in terms of a linear operator acting on the space of measurement functions of the system. We provide novel theoretical results on the existence and nature of symmetries relevant for control systems such as reinforcement learning settings. 
Moreover, we empirically evaluate our method on several benchmark offline reinforcement learning tasks and datasets including D4RL, Metaworld and Robosuite and find that by using our framework we consistently improve the state-of-the-art of model-free Q-learning methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/weissenbacher22a/weissenbacher22a.pdf", "supp": "", "pdf_size": 1124746, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2153236057017473258&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "RIKEN Center for Advanced Intelligence Project, Japan; Vector Institute, University of Toronto, Canada; Vector Institute, University of Toronto, Canada; RIKEN Center for Advanced Intelligence Project, Japan+Institute of Mathematics for Industry, Kyushu University, Japan", "aff_domain": "a.riken.jp; ; ; ", "email": "a.riken.jp; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/weissenbacher22a.html", "aff_unique_index": "0;1;1;0+2", "aff_unique_norm": "RIKEN Center for Advanced Intelligence Project;University of Toronto;Kyushu University", "aff_unique_dep": "Center for Advanced Intelligence Project;Vector Institute;Institute of Mathematics for Industry", "aff_unique_url": "https://www.riken.jp/en/c-aip/;https://www.vectorinstitute.ai;https://www.kyushu-u.ac.jp", "aff_unique_abbr": "RIKEN C-AIP;U of T;Kyushu U", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;1;0+0", "aff_country_unique": "Japan;Canada" }, { "title": "LCANets: Lateral Competition Improves Robustness Against Corruption and Attack", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17159", "id": "17159", "proceeding": "https://proceedings.mlr.press/v162/teti22a.html", "poster": "/media/PosterPDFs/ICML%202022/54eea69746513c0b90bbe6227b6f46c3_g6U3VDU.png?t=1657744117.5308828", "slides": "", "author_site": "Michael Teti, Garrett T Kenyon, Benjamin Migliori, Juston Moore", "author": "Michael Teti; Garrett Kenyon; Ben Migliori; Juston Moore", "abstract": "Although Convolutional Neural Networks (CNNs) achieve high accuracy on image recognition tasks, they lack robustness against realistic corruptions and fail catastrophically when deliberately attacked. Previous CNNs with representations similar to primary visual cortex (V1) were more robust to adversarial attacks on images than current adversarial defense techniques, but they required training on large-scale neural recordings or handcrafting neuroscientific models. Motivated by evidence that neural activity in V1 is sparse, we develop a class of hybrid CNNs, called LCANets, which feature a frontend that performs sparse coding via local lateral competition. We demonstrate that LCANets achieve competitive clean accuracy to standard CNNs on action and image recognition tasks and significantly greater accuracy under various image corruptions. We also perform the first adversarial attacks with full knowledge of a sparse coding CNN layer by attacking LCANets with white-box and black-box attacks, and we show that, contrary to previous hypotheses, sparse coding layers are not very robust to white-box attacks. 
Finally, we propose a way to use sparse coding layers as a plug-and-play robust frontend by showing that they significantly increase the robustness of adversarially-trained CNNs over corruptions and attacks.", "bibtex": "@InProceedings{pmlr-v162-teti22a,\n title = \t {{LCAN}ets: Lateral Competition Improves Robustness Against Corruption and Attack},\n author = {Teti, Michael and Kenyon, Garrett and Migliori, Ben and Moore, Juston},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21232--21252},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/teti22a/teti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/teti22a.html},\n abstract = \t {Although Convolutional Neural Networks (CNNs) achieve high accuracy on image recognition tasks, they lack robustness against realistic corruptions and fail catastrophically when deliberately attacked. Previous CNNs with representations similar to primary visual cortex (V1) were more robust to adversarial attacks on images than current adversarial defense techniques, but they required training on large-scale neural recordings or handcrafting neuroscientific models. Motivated by evidence that neural activity in V1 is sparse, we develop a class of hybrid CNNs, called LCANets, which feature a frontend that performs sparse coding via local lateral competition. We demonstrate that LCANets achieve competitive clean accuracy to standard CNNs on action and image recognition tasks and significantly greater accuracy under various image corruptions. We also perform the first adversarial attacks with full knowledge of a sparse coding CNN layer by attacking LCANets with white-box and black-box attacks, and we show that, contrary to previous hypotheses, sparse coding layers are not very robust to white-box attacks. 
Finally, we propose a way to use sparse coding layers as a plug-and-play robust frontend by showing that they significantly increase the robustness of adversarially-trained CNNs over corruptions and attacks.}\n}", "pdf": "https://proceedings.mlr.press/v162/teti22a/teti22a.pdf", "supp": "", "pdf_size": 1290987, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15616024995948669354&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Los Alamos National Laboratory; Los Alamos National Laboratory; Los Alamos National Laboratory; Los Alamos National Laboratory", "aff_domain": "lanl.gov; ; ; ", "email": "lanl.gov; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/teti22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Los Alamos National Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.lanl.gov", "aff_unique_abbr": "LANL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "LIDL: Local Intrinsic Dimension Estimation Using Approximate Likelihood", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18215", "id": "18215", "proceeding": "https://proceedings.mlr.press/v162/tempczyk22a.html", "poster": "/media/PosterPDFs/ICML%202022/47f91db40efc6a22350eca5c953c4742_SH6W0W6.png?t=1658407141.93952", "slides": "/media/icml-2022/Slides/18215.pdf", "author_site": "Piotr Tempczyk, Rafa\u0142 Michaluk, \u0141ukasz Garncarek, Przemys\u0142aw Spurek, Jacek Tabor, Adam Golinski", "author": "Piotr Tempczyk; Rafa\u0142 Michaluk; Lukasz Garncarek; Przemys\u0142aw Spurek; Jacek Tabor; Adam Golinski", "abstract": "Most of the existing methods for estimating the local intrinsic dimension of a data distribution do not scale well to high dimensional data. Many of them rely on a non-parametric nearest neighbours approach which suffers from the curse of dimensionality. We attempt to address that challenge by proposing a novel approach to the problem: Local Intrinsic Dimension estimation using approximate Likelihood (LIDL). Our method relies on an arbitrary density estimation method as its subroutine, and hence tries to sidestep the dimensionality challenge by making use of the recent progress in parametric neural methods for likelihood estimation. We carefully investigate the empirical properties of the proposed method, compare them with our theoretical predictions, show that LIDL yields competitive results on the standard benchmarks for this problem, and that it scales to thousands of dimensions. 
What is more, we anticipate this approach to improve further with the continuing advances in the density estimation literature.", "bibtex": "@InProceedings{pmlr-v162-tempczyk22a,\n title = \t {{LIDL}: Local Intrinsic Dimension Estimation Using Approximate Likelihood},\n author = {Tempczyk, Piotr and Michaluk, Rafa{\\l} and Garncarek, Lukasz and Spurek, Przemys{\\l}aw and Tabor, Jacek and Golinski, Adam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21205--21231},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tempczyk22a/tempczyk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tempczyk22a.html},\n abstract = \t {Most of the existing methods for estimating the local intrinsic dimension of a data distribution do not scale well to high dimensional data. Many of them rely on a non-parametric nearest neighbours approach which suffers from the curse of dimensionality. We attempt to address that challenge by proposing a novel approach to the problem: Local Intrinsic Dimension estimation using approximate Likelihood (LIDL). Our method relies on an arbitrary density estimation method as its subroutine, and hence tries to sidestep the dimensionality challenge by making use of the recent progress in parametric neural methods for likelihood estimation. We carefully investigate the empirical properties of the proposed method, compare them with our theoretical predictions, show that LIDL yields competitive results on the standard benchmarks for this problem, and that it scales to thousands of dimensions. 
What is more, we anticipate this approach to improve further with the continuing advances in the density estimation literature.}\n}", "pdf": "https://proceedings.mlr.press/v162/tempczyk22a/tempczyk22a.pdf", "supp": "", "pdf_size": 6045000, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9636618006452252616&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Institute of Informatics, University of Warsaw+Polish National Institute for Machine Learning+deeptale.ai; Institute of Informatics, University of Warsaw+Polish National Institute for Machine Learning+Applica; Polish National Institute for Machine Learning+Applica; GMUM, Jagiellonian University; GMUM, Jagiellonian University; University of Oxford", "aff_domain": "mimuw.edu.pl; ; ; ; ; ", "email": "mimuw.edu.pl; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/tempczyk22a.html", "aff_unique_index": "0+1+2;0+1;1;4;4;5", "aff_unique_norm": "University of Warsaw;Polish National Institute for Machine Learning;Deeptale AI;;Jagiellonian University;University of Oxford", "aff_unique_dep": "Institute of Informatics;;;;GMUM;", "aff_unique_url": "https://www.uw.edu.pl;;https://www.deeptale.ai;;https://www.uj.edu.pl;https://www.ox.ac.uk", "aff_unique_abbr": "UW;;Deeptale AI;;;Oxford", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0+0;0;0;0;3", "aff_country_unique": "Poland;United States;;United Kingdom" }, { "title": "LIMO: Latent Inceptionism for Targeted Molecule Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16741", "id": "16741", "proceeding": "https://proceedings.mlr.press/v162/eckmann22a.html", "poster": "/media/PosterPDFs/ICML%202022/e069ea4c9c233d36ff9c7f329bc08ff1_0sE5cIx.png?t=1657215859.7950253", "slides": "", "author_site": "Peter Eckmann, Kunyang Sun, Bo Zhao, Mudong Feng, Michael Gilson, Rose Yu", "author": "Peter Eckmann; Kunyang Sun; Bo Zhao; Mudong Feng; Michael Gilson; Rose Yu", "abstract": "Generation of drug-like molecules with high binding affinity to target proteins remains a difficult and resource-intensive task in drug discovery. Existing approaches primarily employ reinforcement learning, Markov sampling, or deep generative models guided by Gaussian processes, which can be prohibitively slow when generating molecules with high binding affinity calculated by computationally-expensive physics-based methods. We present Latent Inceptionism on Molecules (LIMO), which significantly accelerates molecule generation with an inceptionism-like technique. LIMO employs a variational autoencoder-generated latent space and property prediction by two neural networks in sequence to enable faster gradient-based reverse-optimization of molecular properties. Comprehensive experiments show that LIMO performs competitively on benchmark tasks and markedly outperforms state-of-the-art techniques on the novel task of generating drug-like compounds with high binding affinity, reaching nanomolar range against two protein targets. We corroborate these docking-based results with more accurate molecular dynamics-based calculations of absolute binding free energy and show that one of our generated drug-like compounds has a predicted $K_D$ (a measure of binding affinity) of $6 \\cdot 10^{-14}$ M against the human estrogen receptor, well beyond the affinities of typical early-stage drug candidates and most FDA-approved drugs to their respective targets. 
Code is available at https://github.com/Rose-STL-Lab/LIMO.", "bibtex": "@InProceedings{pmlr-v162-eckmann22a,\n title = \t {{LIMO}: Latent Inceptionism for Targeted Molecule Generation},\n author = {Eckmann, Peter and Sun, Kunyang and Zhao, Bo and Feng, Mudong and Gilson, Michael and Yu, Rose},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5777--5792},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/eckmann22a/eckmann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/eckmann22a.html},\n abstract = \t {Generation of drug-like molecules with high binding affinity to target proteins remains a difficult and resource-intensive task in drug discovery. Existing approaches primarily employ reinforcement learning, Markov sampling, or deep generative models guided by Gaussian processes, which can be prohibitively slow when generating molecules with high binding affinity calculated by computationally-expensive physics-based methods. We present Latent Inceptionism on Molecules (LIMO), which significantly accelerates molecule generation with an inceptionism-like technique. LIMO employs a variational autoencoder-generated latent space and property prediction by two neural networks in sequence to enable faster gradient-based reverse-optimization of molecular properties. Comprehensive experiments show that LIMO performs competitively on benchmark tasks and markedly outperforms state-of-the-art techniques on the novel task of generating drug-like compounds with high binding affinity, reaching nanomolar range against two protein targets. We corroborate these docking-based results with more accurate molecular dynamics-based calculations of absolute binding free energy and show that one of our generated drug-like compounds has a predicted $K_D$ (a measure of binding affinity) of $6 \\cdot 10^{-14}$ M against the human estrogen receptor, well beyond the affinities of typical early-stage drug candidates and most FDA-approved drugs to their respective targets. 
Code is available at https://github.com/Rose-STL-Lab/LIMO.}\n}", "pdf": "https://proceedings.mlr.press/v162/eckmann22a/eckmann22a.pdf", "supp": "", "pdf_size": 10844898, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12167942813454300503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Department of Computer Science and Engineering, UC San Diego; Department of Chemistry and Biochemistry, UC San Diego; Department of Computer Science and Engineering, UC San Diego + Skaggs School of Pharmacy and Pharmaceutical Sciences, UC San Diego; Department of Chemistry and Biochemistry, UC San Diego + Skaggs School of Pharmacy and Pharmaceutical Sciences, UC San Diego; Department of Chemistry and Biochemistry, UC San Diego + Skaggs School of Pharmacy and Pharmaceutical Sciences, UC San Diego; Department of Computer Science and Engineering, UC San Diego", "aff_domain": "health.ucsd.edu;ucsd.edu; ; ; ; ", "email": "health.ucsd.edu;ucsd.edu; ; ; ; ", "github": "https://github.com/Rose-STL-Lab/LIMO", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/eckmann22a.html", "aff_unique_index": "0;0;0+0;0+0;0+0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0+0;0+0;0+0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "LSB: Local Self-Balancing MCMC in Discrete Spaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16689", "id": "16689", "proceeding": "https://proceedings.mlr.press/v162/sansone22a.html", "poster": "/media/PosterPDFs/ICML%202022/8d2a5f7d4afa5d0530789d3066945330_ynPmT2d.png?t=1657889442.677047", "slides": "/media/icml-2022/Slides/16689.pdf", "author_site": "EMANUELE SANSONE", "author": "Emanuele Sansone", "abstract": "We present the Local Self-Balancing sampler (LSB), a local Markov Chain Monte Carlo (MCMC) method for sampling in purely discrete domains, which is able to autonomously adapt to the target distribution and to reduce the number of target evaluations required to converge. LSB is based on (i) a parametrization of locally balanced proposals, (ii) an objective function based on mutual information and (iii) a self-balancing learning procedure, which minimises the proposed objective to update the proposal parameters. 
Experiments on energy-based models and Markov networks show that LSB converges using a smaller number of queries to the oracle distribution compared to recent local MCMC samplers.", "bibtex": "@InProceedings{pmlr-v162-sansone22a,\n title = \t {{LSB}: Local Self-Balancing {MCMC} in Discrete Spaces},\n author = {Sansone, Emanuele},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19205--19220},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sansone22a/sansone22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sansone22a.html},\n abstract = \t {We present the Local Self-Balancing sampler (LSB), a local Markov Chain Monte Carlo (MCMC) method for sampling in purely discrete domains, which is able to autonomously adapt to the target distribution and to reduce the number of target evaluations required to converge. LSB is based on (i) a parametrization of locally balanced proposals, (ii) an objective function based on mutual information and (iii) a self-balancing learning procedure, which minimises the proposed objective to update the proposal parameters. Experiments on energy-based models and Markov networks show that LSB converges using a smaller number of queries to the oracle distribution compared to recent local MCMC samplers.}\n}", "pdf": "https://proceedings.mlr.press/v162/sansone22a/sansone22a.pdf", "supp": "", "pdf_size": 2872530, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4624892797012274460&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, KU Leuven, Belgium", "aff_domain": "kuleuven.be", "email": "kuleuven.be", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/sansone22a.html", "aff_unique_index": "0", "aff_unique_norm": "KU Leuven", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven", "aff_country_unique_index": "0", "aff_country_unique": "Belgium" }, { "title": "Label Ranking through Nonparametric Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16449", "id": "16449", "proceeding": "https://proceedings.mlr.press/v162/fotakis22a.html", "poster": "/media/PosterPDFs/ICML%202022/4491777b1aa8b5b32c2e8666dbe1a495.png?t=1657453868.425729", "slides": "", "author_site": "Dimitris Fotakis, Alkis Kalavasis, Eleni Psaroudaki", "author": "Dimitris Fotakis; Alkis Kalavasis; Eleni Psaroudaki", "abstract": "Label Ranking (LR) corresponds to the problem of learning a hypothesis that maps features to rankings over a finite set of labels. We adopt a nonparametric regression approach to LR and obtain theoretical performance guarantees for this fundamental practical problem. We introduce a generative model for Label Ranking, in noiseless and noisy nonparametric regression settings, and provide sample complexity bounds for learning algorithms in both cases. In the noiseless setting, we study the LR problem with full rankings and provide computationally efficient algorithms using decision trees and random forests in the high-dimensional regime. 
In the noisy setting, we consider the more general cases of LR with incomplete and partial rankings from a statistical viewpoint and obtain sample complexity bounds using the One-Versus-One approach of multiclass classification. Finally, we complement our theoretical contributions with experiments, aiming to understand how the input regression noise affects the observed output.", "bibtex": "@InProceedings{pmlr-v162-fotakis22a,\n title = \t {Label Ranking through Nonparametric Regression},\n author = {Fotakis, Dimitris and Kalavasis, Alkis and Psaroudaki, Eleni},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6622--6659},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fotakis22a/fotakis22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fotakis22a.html},\n abstract = \t {Label Ranking (LR) corresponds to the problem of learning a hypothesis that maps features to rankings over a finite set of labels. We adopt a nonparametric regression approach to LR and obtain theoretical performance guarantees for this fundamental practical problem. We introduce a generative model for Label Ranking, in noiseless and noisy nonparametric regression settings, and provide sample complexity bounds for learning algorithms in both cases. In the noiseless setting, we study the LR problem with full rankings and provide computationally efficient algorithms using decision trees and random forests in the high-dimensional regime. In the noisy setting, we consider the more general cases of LR with incomplete and partial rankings from a statistical viewpoint and obtain sample complexity bounds using the One-Versus-One approach of multiclass classification. 
Finally, we complement our theoretical contributions with experiments, aiming to understand how the input regression noise affects the observed output.}\n}", "pdf": "https://proceedings.mlr.press/v162/fotakis22a/fotakis22a.pdf", "supp": "", "pdf_size": 711107, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16479566706175346510&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "School of Electrical and Computer Engineering, National Technical University of Athens, Athens, Greece; School of Electrical and Computer Engineering, National Technical University of Athens, Athens, Greece; School of Electrical and Computer Engineering, National Technical University of Athens, Athens, Greece", "aff_domain": "mail.ntua.gr; ; ", "email": "mail.ntua.gr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/fotakis22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "National Technical University of Athens", "aff_unique_dep": "School of Electrical and Computer Engineering", "aff_unique_url": "https://www.ntua.gr", "aff_unique_abbr": "NTUA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Athens", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Greece" }, { "title": "Label-Descriptive Patterns and Their Application to Characterizing Classification Errors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17715", "id": "17715", "proceeding": "https://proceedings.mlr.press/v162/hedderich22a.html", "poster": "/media/PosterPDFs/ICML%202022/8a4488c177d9dc8c3da7c745c89ca214.png?t=1657491011.9220567", "slides": "", "author_site": "Michael Hedderich, Jonas Fischer, Dietrich Klakow, Jilles Vreeken", "author": "Michael A. Hedderich; Jonas Fischer; Dietrich Klakow; Jilles Vreeken", "abstract": "State-of-the-art deep learning methods achieve human-like performance on many tasks, but make errors nevertheless. Characterizing these errors in easily interpretable terms gives insight into whether a classifier is prone to making systematic errors, but also gives a way to act and improve the classifier. We propose to discover those feature-value combinations (i.e., patterns) that strongly correlate with correct resp. erroneous predictions to obtain a global and interpretable description for arbitrary classifiers. We show this is an instance of the more general label description problem, which we formulate in terms of the Minimum Description Length principle. To discover a good pattern set, we develop the efficient Premise algorithm. Through an extensive set of experiments we show it performs very well in practice on both synthetic and real-world data. Unlike existing solutions, it ably recovers ground truth patterns, even on highly imbalanced data over many features. Through two case studies on Visual Question Answering and Named Entity Recognition, we confirm that Premise gives clear and actionable insight into the systematic errors made by modern NLP classifiers.", "bibtex": "@InProceedings{pmlr-v162-hedderich22a,\n title = \t {Label-Descriptive Patterns and Their Application to Characterizing Classification Errors},\n author = {Hedderich, Michael A. 
and Fischer, Jonas and Klakow, Dietrich and Vreeken, Jilles},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8691--8707},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hedderich22a/hedderich22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hedderich22a.html},\n abstract = \t {State-of-the-art deep learning methods achieve human-like performance on many tasks, but make errors nevertheless. Characterizing these errors in easily interpretable terms gives insight into whether a classifier is prone to making systematic errors, but also gives a way to act and improve the classifier. We propose to discover those feature-value combinations (i.e., patterns) that strongly correlate with correct resp. erroneous predictions to obtain a global and interpretable description for arbitrary classifiers. We show this is an instance of the more general label description problem, which we formulate in terms of the Minimum Description Length principle. To discover a good pattern set, we develop the efficient Premise algorithm. Through an extensive set of experiments we show it performs very well in practice on both synthetic and real-world data. Unlike existing solutions, it ably recovers ground truth patterns, even on highly imbalanced data over many features. Through two case studies on Visual Question Answering and Named Entity Recognition, we confirm that Premise gives clear and actionable insight into the systematic errors made by modern NLP classifiers.}\n}", "pdf": "https://proceedings.mlr.press/v162/hedderich22a/hedderich22a.pdf", "supp": "", "pdf_size": 432276, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17151062876326396641&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16, "aff": "Saarland University, Saarland Informatics Campus, Saarbr\u00fccken, Germany+Max Planck Institute for Informatics, Saarbr\u00fccken, Germany; Max Planck Institute for Informatics, Saarbr\u00fccken, Germany; Saarland University, Saarland Informatics Campus, Saarbr\u00fccken, Germany; CISPA Helmholtz Center for information Security, Saarbr\u00fccken, Germany", "aff_domain": "lsv.uni-saarland.de;mpi-inf.mpg.de; ; ", "email": "lsv.uni-saarland.de;mpi-inf.mpg.de; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hedderich22a.html", "aff_unique_index": "0+1;1;0;2", "aff_unique_norm": "Saarland University;Max Planck Institute for Informatics;CISPA Helmholtz Center for Information Security", "aff_unique_dep": "Saarland Informatics Campus;;", "aff_unique_url": "https://www.uni-saarland.de;https://mpi-inf.mpg.de;https://www.cispa.de/", "aff_unique_abbr": "UdS;MPII;CISPA", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Saarbr\u00fccken", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Label-Free Explainability for Unsupervised Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16183", "id": "16183", "proceeding": "https://proceedings.mlr.press/v162/crabbe22a.html", "poster": "/media/PosterPDFs/ICML%202022/1abb1e1ea5f481b589da52303b091cbb.png?t=1657445932.4065862", "slides": "", "author_site": "Jonathan Crabb\u00e9, Mihaela 
van der Schaar", "author": "Jonathan Crabb\u00e9; Mihaela van der Schaar", "abstract": "Unsupervised black-box models are challenging to interpret. Indeed, most existing explainability methods require labels to select which component(s) of the black-box\u2019s output to interpret. In the absence of labels, black-box outputs often are representation vectors whose components do not correspond to any meaningful quantity. Hence, choosing which component(s) to interpret in a label-free unsupervised/self-supervised setting is an important, yet unsolved problem. To bridge this gap in the literature, we introduce two crucial extensions of post-hoc explanation techniques: (1) label-free feature importance and (2) label-free example importance that respectively highlight influential features and training examples for a black-box to construct representations at inference time. We demonstrate that our extensions can be successfully implemented as simple wrappers around many existing feature and example importance methods. We illustrate the utility of our label-free explainability paradigm through a qualitative and quantitative comparison of representation spaces learned by various autoencoders trained on distinct unsupervised tasks.", "bibtex": "@InProceedings{pmlr-v162-crabbe22a,\n title = \t {Label-Free Explainability for Unsupervised Models},\n author = {Crabb{\\'e}, Jonathan and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4391--4420},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/crabbe22a/crabbe22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/crabbe22a.html},\n abstract = \t {Unsupervised black-box models are challenging to interpret. Indeed, most existing explainability methods require labels to select which component(s) of the black-box\u2019s output to interpret. In the absence of labels, black-box outputs often are representation vectors whose components do not correspond to any meaningful quantity. Hence, choosing which component(s) to interpret in a label-free unsupervised/self-supervised setting is an important, yet unsolved problem. To bridge this gap in the literature, we introduce two crucial extensions of post-hoc explanation techniques: (1) label-free feature importance and (2) label-free example importance that respectively highlight influential features and training examples for a black-box to construct representations at inference time. We demonstrate that our extensions can be successfully implemented as simple wrappers around many existing feature and example importance methods. 
We illustrate the utility of our label-free explainability paradigm through a qualitative and quantitative comparison of representation spaces learned by various autoencoders trained on distinct unsupervised tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/crabbe22a/crabbe22a.pdf", "supp": "", "pdf_size": 9983324, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7922317859628289226&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Cambridge + The Alan Turing Institute + University of California Los Angeles; University of Cambridge + The Alan Turing Institute + University of California Los Angeles", "aff_domain": "cam.ac.uk;cam.ac.uk", "email": "cam.ac.uk;cam.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/crabbe22a.html", "aff_unique_index": "0+1+2;0+1+2", "aff_unique_norm": "University of Cambridge;Alan Turing Institute;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.turing.ac.uk;https://www.ucla.edu", "aff_unique_abbr": "Cambridge;ATI;UCLA", "aff_campus_unique_index": "0+2;0+2", "aff_campus_unique": "Cambridge;;Los Angeles", "aff_country_unique_index": "0+0+1;0+0+1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Lagrangian Method for Q-Function Learning (with Applications to Machine Translation)", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17953", "id": "17953", "proceeding": "https://proceedings.mlr.press/v162/bojun22a.html", "poster": "/media/PosterPDFs/ICML%202022/536eecee295b92db6b32194e269541f8_MC9q6K3.png?t=1657206719.8235865", "slides": "", "author": "Huang Bojun", "abstract": "This paper discusses a new approach to the fundamental problem of learning optimal Q-functions. In this approach, optimal Q-functions are formulated as saddle points of a nonlinear Lagrangian function derived from the classic Bellman optimality equation. The paper shows that the Lagrangian enjoys strong duality, in spite of its nonlinearity, which paves the way to a general Lagrangian method to Q-function learning. As a demonstration, the paper develops an imitation learning algorithm based on the duality theory, and applies the algorithm to a state-of-the-art machine translation benchmark. The paper then turns to demonstrate a symmetry breaking phenomenon regarding the optimality of the Lagrangian saddle points, which justifies a largely overlooked direction in developing the Lagrangian method.", "bibtex": "@InProceedings{pmlr-v162-bojun22a,\n title = \t {Lagrangian Method for Q-Function Learning (with Applications to Machine Translation)},\n author = {Bojun, Huang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2129--2159},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bojun22a/bojun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bojun22a.html},\n abstract = \t {This paper discusses a new approach to the fundamental problem of learning optimal Q-functions. In this approach, optimal Q-functions are formulated as saddle points of a nonlinear Lagrangian function derived from the classic Bellman optimality equation. 
The paper shows that the Lagrangian enjoys strong duality, in spite of its nonlinearity, which paves the way to a general Lagrangian method to Q-function learning. As a demonstration, the paper develops an imitation learning algorithm based on the duality theory, and applies the algorithm to a state-of-the-art machine translation benchmark. The paper then turns to demonstrate a symmetry breaking phenomenon regarding the optimality of the Lagrangian saddle points, which justifies a largely overlooked direction in developing the Lagrangian method.}\n}", "pdf": "https://proceedings.mlr.press/v162/bojun22a/bojun22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bojun22a-supp.zip", "pdf_size": 1045635, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1469409701550340782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Rakuten Institute of Technology, Rakuten Group Inc., Japan", "aff_domain": "gmail.com", "email": "gmail.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/bojun22a.html", "aff_unique_index": "0", "aff_unique_norm": "Rakuten Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://rit.rakuten.com", "aff_unique_abbr": "RIT", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Langevin Monte Carlo for Contextual Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18145", "id": "18145", "proceeding": "https://proceedings.mlr.press/v162/xu22p.html", "poster": "/media/PosterPDFs/ICML%202022/561918f13a2832726ec7f2e16ecd76c1_Yq4cWtn.png?t=1658246614.4239206", "slides": "", "author_site": "Pan Xu, Hongkai Zheng, Eric Mazumdar, Kamyar Azizzadenesheli, Animashree Anandkumar", "author": "Pan Xu; Hongkai Zheng; Eric V Mazumdar; Kamyar Azizzadenesheli; Animashree Anandkumar", "abstract": "We study the efficiency of Thompson sampling for contextual bandits. Existing Thompson sampling-based algorithms need to construct a Laplace approximation (i.e., a Gaussian distribution) of the posterior distribution, which is inefficient to sample in high dimensional applications for general covariance matrices. Moreover, the Gaussian approximation may not be a good surrogate for the posterior distribution for general reward generating functions. We propose an efficient posterior sampling algorithm, viz., Langevin Monte Carlo Thompson Sampling (LMC-TS), that uses Markov Chain Monte Carlo (MCMC) methods to directly sample from the posterior distribution in contextual bandits. Our method is computationally efficient since it only needs to perform noisy gradient descent updates without constructing the Laplace approximation of the posterior distribution. We prove that the proposed algorithm achieves the same sublinear regret bound as the best Thompson sampling algorithms for a special case of contextual bandits, viz., linear contextual bandits. 
We conduct experiments on both synthetic data and real-world datasets on different contextual bandit models, which demonstrates that directly sampling from the posterior is both computationally efficient and competitive in performance.", "bibtex": "@InProceedings{pmlr-v162-xu22p,\n title = \t {{L}angevin {M}onte {C}arlo for Contextual Bandits},\n author = {Xu, Pan and Zheng, Hongkai and Mazumdar, Eric V and Azizzadenesheli, Kamyar and Anandkumar, Animashree},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24830--24850},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22p/xu22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22p.html},\n abstract = \t {We study the efficiency of Thompson sampling for contextual bandits. Existing Thompson sampling-based algorithms need to construct a Laplace approximation (i.e., a Gaussian distribution) of the posterior distribution, which is inefficient to sample in high dimensional applications for general covariance matrices. Moreover, the Gaussian approximation may not be a good surrogate for the posterior distribution for general reward generating functions. We propose an efficient posterior sampling algorithm, viz., Langevin Monte Carlo Thompson Sampling (LMC-TS), that uses Markov Chain Monte Carlo (MCMC) methods to directly sample from the posterior distribution in contextual bandits. Our method is computationally efficient since it only needs to perform noisy gradient descent updates without constructing the Laplace approximation of the posterior distribution. We prove that the proposed algorithm achieves the same sublinear regret bound as the best Thompson sampling algorithms for a special case of contextual bandits, viz., linear contextual bandits. 
We conduct experiments on both synthetic data and real-world datasets on different contextual bandit models, which demonstrates that directly sampling from the posterior is both computationally efficient and competitive in performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22p/xu22p.pdf", "supp": "", "pdf_size": 5629567, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17947059462373456392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, CA, USA; Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, CA, USA; Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, CA, USA; Department of Computer Science, Purdue University, West Lafayette, IN, USA; Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, CA, USA", "aff_domain": "duke.edu; ; ; ; ", "email": "duke.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xu22p.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "California Institute of Technology;Purdue University", "aff_unique_dep": "Department of Computing and Mathematical Sciences;Department of Computer Science", "aff_unique_url": "https://www.caltech.edu;https://www.purdue.edu", "aff_unique_abbr": "Caltech;Purdue", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Pasadena;West Lafayette", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17945", "id": "17945", "proceeding": "https://proceedings.mlr.press/v162/huang22a.html", "poster": "", "slides": "", "author_site": "Wenlong Huang, Pieter Abbeel, Deepak Pathak, Igor Mordatch", "author": "Wenlong Huang; Pieter Abbeel; Deepak Pathak; Igor Mordatch", "abstract": "Can world knowledge learned by large language models (LLMs) be used to act in interactive environments? In this paper, we investigate the possibility of grounding high-level tasks, expressed in natural language (e.g. \u201cmake breakfast\u201d), to a chosen set of actionable steps (e.g. \u201copen fridge\u201d). While prior work focused on learning from explicit step-by-step examples of how to act, we surprisingly find that if pre-trained LMs are large enough and prompted appropriately, they can effectively decompose high-level tasks into mid-level plans without any further training. However, the plans produced naively by LLMs often cannot map precisely to admissible actions. We propose a procedure that conditions on existing demonstrations and semantically translates the plans to admissible actions. Our evaluation in the recent VirtualHome environment shows that the resulting method substantially improves executability over the LLM baseline. 
The conducted human evaluation reveals a trade-off between executability and correctness but shows a promising sign towards extracting actionable knowledge from language models.", "bibtex": "@InProceedings{pmlr-v162-huang22a,\n title = \t {Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents},\n author = {Huang, Wenlong and Abbeel, Pieter and Pathak, Deepak and Mordatch, Igor},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9118--9147},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22a/huang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22a.html},\n abstract = \t {Can world knowledge learned by large language models (LLMs) be used to act in interactive environments? In this paper, we investigate the possibility of grounding high-level tasks, expressed in natural language (e.g. \u201cmake breakfast\u201d), to a chosen set of actionable steps (e.g. \u201copen fridge\u201d). While prior work focused on learning from explicit step-by-step examples of how to act, we surprisingly find that if pre-trained LMs are large enough and prompted appropriately, they can effectively decompose high-level tasks into mid-level plans without any further training. However, the plans produced naively by LLMs often cannot map precisely to admissible actions. We propose a procedure that conditions on existing demonstrations and semantically translates the plans to admissible actions. Our evaluation in the recent VirtualHome environment shows that the resulting method substantially improves executability over the LLM baseline. 
The conducted human evaluation reveals a trade-off between executability and correctness but shows a promising sign towards extracting actionable knowledge from language models.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22a/huang22a.pdf", "supp": "", "pdf_size": 2189588, "gs_citation": 1280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11998123682359381476&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of California, Berkeley; University of California, Berkeley; Carnegie Mellon University; Google", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "https://huangwl18.github.io/language-planner/", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/huang22a.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;CMU;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Large Batch Experience Replay", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15971", "id": "15971", "proceeding": "https://proceedings.mlr.press/v162/lahire22a.html", "poster": "/media/PosterPDFs/ICML%202022/3806734b256c27e41ec2c6bffa26d9e7.png?t=1657518324.0016177", "slides": "/media/icml-2022/Slides/15971.pdf", "author_site": "Thibault Lahire, Matthieu Geist, Emmanuel Rachelson", "author": "Thibault Lahire; Matthieu Geist; Emmanuel Rachelson", "abstract": "Several algorithms have been proposed to sample non-uniformly the replay buffer of deep Reinforcement Learning (RL) agents to speed-up learning, but very few theoretical foundations of these sampling schemes have been provided. Among others, Prioritized Experience Replay appears as a hyperparameter sensitive heuristic, even though it can provide good performance. In this work, we cast the replay buffer sampling problem as an importance sampling one for estimating the gradient. This allows deriving the theoretically optimal sampling distribution, yielding the best theoretical convergence speed. Elaborating on the knowledge of the ideal sampling scheme, we exhibit new theoretical foundations of Prioritized Experience Replay. The optimal sampling distribution being intractable, we make several approximations providing good results in practice and introduce, among others, LaBER (Large Batch Experience Replay), an easy-to-code and efficient method for sampling the replay buffer. 
LaBER, which can be combined with Deep Q-Networks, distributional RL agents or actor-critic methods, yields improved performance over a diverse range of Atari games and PyBullet environments, compared to the base agent it is implemented on and to other prioritization schemes.", "bibtex": "@InProceedings{pmlr-v162-lahire22a,\n title = \t {Large Batch Experience Replay},\n author = {Lahire, Thibault and Geist, Matthieu and Rachelson, Emmanuel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11790--11813},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lahire22a/lahire22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lahire22a.html},\n abstract = \t {Several algorithms have been proposed to sample non-uniformly the replay buffer of deep Reinforcement Learning (RL) agents to speed-up learning, but very few theoretical foundations of these sampling schemes have been provided. Among others, Prioritized Experience Replay appears as a hyperparameter sensitive heuristic, even though it can provide good performance. In this work, we cast the replay buffer sampling problem as an importance sampling one for estimating the gradient. This allows deriving the theoretically optimal sampling distribution, yielding the best theoretical convergence speed. Elaborating on the knowledge of the ideal sampling scheme, we exhibit new theoretical foundations of Prioritized Experience Replay. The optimal sampling distribution being intractable, we make several approximations providing good results in practice and introduce, among others, LaBER (Large Batch Experience Replay), an easy-to-code and efficient method for sampling the replay buffer. 
LaBER, which can be combined with Deep Q-Networks, distributional RL agents or actor-critic methods, yields improved performance over a diverse range of Atari games and PyBullet environments, compared to the base agent it is implemented on and to other prioritization schemes.}\n}", "pdf": "https://proceedings.mlr.press/v162/lahire22a/lahire22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lahire22a-supp.zip", "pdf_size": 12864333, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7195743594836265223&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "ISAE-SUPAERO, Universit \u00b4e de Toulouse, France; Google Research, Brain Team; ISAE-SUPAERO, Universit \u00b4e de Toulouse, France", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lahire22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "ISAE-SUPAERO;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.isae-supaero.fr;https://research.google", "aff_unique_abbr": "ISAE-SUPAERO;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0", "aff_country_unique": "France;United States" }, { "title": "Large-Scale Graph Neural Architecture Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18125", "id": "18125", "proceeding": "https://proceedings.mlr.press/v162/guan22d.html", "poster": "", "slides": "", "author_site": "Chaoyu Guan, Xin Wang, Hong Chen, Ziwei Zhang, Wenwu Zhu", "author": "Chaoyu Guan; Xin Wang; Hong Chen; Ziwei Zhang; Wenwu Zhu", "abstract": "Graph Neural Architecture Search (GNAS) has become a powerful method in automatically discovering suitable Graph Neural Network (GNN) architectures for different tasks. However, existing approaches fail to handle large-scale graphs because current performance estimation strategies in GNAS are computationally expensive for large-scale graphs and suffer from consistency collapse issues. To tackle these problems, we propose the Graph ArchitectUre Search at Scale (GAUSS) method that can handle large-scale graphs by designing an efficient light-weight supernet and the joint architecture-graph sampling. In particular, a graph sampling-based single-path one-shot supernet is proposed to reduce the computation burden. To address the consistency collapse issues, we further explicitly consider the joint architecture-graph sampling through a novel architecture peer learning mechanism on the sampled sub-graphs and an architecture importance sampling algorithm. Our proposed framework is able to smooth the highly non-convex optimization objective and stabilize the architecture sampling process. We provide theoretical analyses on GAUSS and empirically evaluate it on five datasets whose vertex sizes range from 10^4 to 10^8. The experimental results demonstrate substantial improvements of GAUSS over other GNAS baselines on all datasets. 
To the best of our knowledge, the proposed GAUSS method is the first graph neural architecture search framework that can handle graphs with billions of edges within 1 GPU day.", "bibtex": "@InProceedings{pmlr-v162-guan22d,\n title = \t {Large-Scale Graph Neural Architecture Search},\n author = {Guan, Chaoyu and Wang, Xin and Chen, Hong and Zhang, Ziwei and Zhu, Wenwu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7968--7981},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guan22d/guan22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/guan22d.html},\n abstract = \t {Graph Neural Architecture Search (GNAS) has become a powerful method in automatically discovering suitable Graph Neural Network (GNN) architectures for different tasks. However, existing approaches fail to handle large-scale graphs because current performance estimation strategies in GNAS are computationally expensive for large-scale graphs and suffer from consistency collapse issues. To tackle these problems, we propose the Graph ArchitectUre Search at Scale (GAUSS) method that can handle large-scale graphs by designing an efficient light-weight supernet and the joint architecture-graph sampling. In particular, a graph sampling-based single-path one-shot supernet is proposed to reduce the computation burden. To address the consistency collapse issues, we further explicitly consider the joint architecture-graph sampling through a novel architecture peer learning mechanism on the sampled sub-graphs and an architecture importance sampling algorithm. Our proposed framework is able to smooth the highly non-convex optimization objective and stabilize the architecture sampling process. We provide theoretical analyses on GAUSS and empirically evaluate it on five datasets whose vertex sizes range from 10^4 to 10^8. The experimental results demonstrate substantial improvements of GAUSS over other GNAS baselines on all datasets. 
To the best of our knowledge, the proposed GAUSS method is the first graph neural architecture search framework that can handle graphs with billions of edges within 1 GPU day.}\n}", "pdf": "https://proceedings.mlr.press/v162/guan22d/guan22d.pdf", "supp": "", "pdf_size": 566823, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12265471467400084663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Media and Network Lab, Department of Computer Science and Technology, Tsinghua University; Media and Network Lab, Department of Computer Science and Technology, Tsinghua University; Media and Network Lab, Department of Computer Science and Technology, Tsinghua University; Media and Network Lab, Department of Computer Science and Technology, Tsinghua University; Media and Network Lab, Department of Computer Science and Technology, Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "github": "https://www.github.com/THUMNLab/GAUSS", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/guan22d.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Large-scale Stochastic Optimization of NDCG Surrogates for Deep Learning with Provable Convergence", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17311", "id": "17311", "proceeding": "https://proceedings.mlr.press/v162/qiu22a.html", "poster": "/media/PosterPDFs/ICML%202022/cc9657884708170e160c8372d92f3535_huj3Wzn.png?t=1657862075.9147248", "slides": "", "author_site": "Zi-Hao Qiu, Quanqi Hu, Yongjian Zhong, Lijun Zhang, Tianbao Yang", "author": "Zi-Hao Qiu; Quanqi Hu; Yongjian Zhong; Lijun Zhang; Tianbao Yang", "abstract": "NDCG, namely Normalized Discounted Cumulative Gain, is a widely used ranking metric in information retrieval and machine learning. However, efficient and provable stochastic methods for maximizing NDCG are still lacking, especially for deep models. In this paper, we propose a principled approach to optimize NDCG and its top-$K$ variant. First, we formulate a novel compositional optimization problem for optimizing the NDCG surrogate, and a novel bilevel compositional optimization problem for optimizing the top-$K$ NDCG surrogate. Then, we develop efficient stochastic algorithms with provable convergence guarantees for the non-convex objectives. Different from existing NDCG optimization methods, the per-iteration complexity of our algorithms scales with the mini-batch size instead of the number of total items. To improve the effectiveness for deep learning, we further propose practical strategies by using initial warm-up and stop gradient operator. Experimental results on multiple datasets demonstrate that our methods outperform prior ranking approaches in terms of NDCG. To the best of our knowledge, this is the first time that stochastic algorithms are proposed to optimize NDCG with a provable convergence guarantee. 
Our proposed methods are implemented in the LibAUC library at https://libauc.org.", "bibtex": "@InProceedings{pmlr-v162-qiu22a,\n title = \t {Large-scale Stochastic Optimization of {NDCG} Surrogates for Deep Learning with Provable Convergence},\n author = {Qiu, Zi-Hao and Hu, Quanqi and Zhong, Yongjian and Zhang, Lijun and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18122--18152},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiu22a/qiu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiu22a.html},\n abstract = \t {NDCG, namely Normalized Discounted Cumulative Gain, is a widely used ranking metric in information retrieval and machine learning. However, efficient and provable stochastic methods for maximizing NDCG are still lacking, especially for deep models. In this paper, we propose a principled approach to optimize NDCG and its top-$K$ variant. First, we formulate a novel compositional optimization problem for optimizing the NDCG surrogate, and a novel bilevel compositional optimization problem for optimizing the top-$K$ NDCG surrogate. Then, we develop efficient stochastic algorithms with provable convergence guarantees for the non-convex objectives. Different from existing NDCG optimization methods, the per-iteration complexity of our algorithms scales with the mini-batch size instead of the number of total items. To improve the effectiveness for deep learning, we further propose practical strategies by using initial warm-up and stop gradient operator. Experimental results on multiple datasets demonstrate that our methods outperform prior ranking approaches in terms of NDCG. To the best of our knowledge, this is the first time that stochastic algorithms are proposed to optimize NDCG with a provable convergence guarantee. 
Our proposed methods are implemented in the LibAUC library at https://libauc.org.}\n}", "pdf": "https://proceedings.mlr.press/v162/qiu22a/qiu22a.pdf", "supp": "", "pdf_size": 13916380, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9377138316635213561&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China+the University of Iowa, Iowa City, USA; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China+the University of Iowa, Iowa City, USA; the University of Iowa, Iowa City, USA; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; the University of Iowa, Iowa City, USA", "aff_domain": "lamda.nju.edu.cn;uiowa.edu; ; ; ", "email": "lamda.nju.edu.cn;uiowa.edu; ; ; ", "github": "", "project": "https://libauc.org/", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qiu22a.html", "aff_unique_index": "0+1;0+1;1;0;1", "aff_unique_norm": "Nanjing University;University of Iowa", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;", "aff_unique_url": "http://www.nju.edu.cn;https://www.uiowa.edu", "aff_unique_abbr": "Nanjing U;UIowa", "aff_campus_unique_index": "0+1;0+1;1;0;1", "aff_campus_unique": "Nanjing;Iowa City", "aff_country_unique_index": "0+1;0+1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Last Iterate Risk Bounds of SGD with Decaying Stepsize for Overparameterized Linear Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17037", "id": "17037", "proceeding": "https://proceedings.mlr.press/v162/wu22p.html", "poster": "/media/PosterPDFs/ICML%202022/a0205b87490c847182672e8d371e9948_TrKklbx.png?t=1658083426.8797288", "slides": "/media/icml-2022/Slides/17037.pdf", "author_site": "Jingfeng Wu, Difan Zou, Vladimir Braverman, Quanquan Gu, Sham Kakade", "author": "Jingfeng Wu; Difan Zou; Vladimir Braverman; Quanquan Gu; Sham Kakade", "abstract": "Stochastic gradient descent (SGD) has been shown to generalize well in many deep learning applications. In practice, one often runs SGD with a geometrically decaying stepsize, i.e., a constant initial stepsize followed by multiple geometric stepsize decay, and uses the last iterate as the output. This kind of SGD is known to be nearly minimax optimal for classical finite-dimensional linear regression problems (Ge et al., 2019). However, a sharp analysis for the last iterate of SGD in the overparameterized setting is still open. In this paper, we provide a problem-dependent analysis on the last iterate risk bounds of SGD with decaying stepsize, for (overparameterized) linear regression problems. In particular, for last iterate SGD with (tail) geometrically decaying stepsize, we prove nearly matching upper and lower bounds on the excess risk. 
Moreover, we provide an excess risk lower bound for last iterate SGD with polynomially decaying stepsize and demonstrate the advantage of geometrically decaying stepsize in an instance-wise manner, which complements the minimax rate comparison made in prior work.", "bibtex": "@InProceedings{pmlr-v162-wu22p,\n title = \t {Last Iterate Risk Bounds of {SGD} with Decaying Stepsize for Overparameterized Linear Regression},\n author = {Wu, Jingfeng and Zou, Difan and Braverman, Vladimir and Gu, Quanquan and Kakade, Sham},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24280--24314},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22p/wu22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22p.html},\n abstract = \t {Stochastic gradient descent (SGD) has been shown to generalize well in many deep learning applications. In practice, one often runs SGD with a geometrically decaying stepsize, i.e., a constant initial stepsize followed by multiple geometric stepsize decay, and uses the last iterate as the output. This kind of SGD is known to be nearly minimax optimal for classical finite-dimensional linear regression problems (Ge et al., 2019). However, a sharp analysis for the last iterate of SGD in the overparameterized setting is still open. In this paper, we provide a problem-dependent analysis on the last iterate risk bounds of SGD with decaying stepsize, for (overparameterized) linear regression problems. In particular, for last iterate SGD with (tail) geometrically decaying stepsize, we prove nearly matching upper and lower bounds on the excess risk. 
Moreover, we provide an excess risk lower bound for last iterate SGD with polynomially decaying stepsize and demonstrate the advantage of geometrically decaying stepsize in an instance-wise manner, which complements the minimax rate comparison made in prior work.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22p/wu22p.pdf", "supp": "", "pdf_size": 699402, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10108078400326624045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Department of Computer Science, Johns Hopkins University, Baltimore, MD 21218, USA+Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Computer Science, Johns Hopkins University, Baltimore, MD 21218, USA; Department of Computer Science and Department of Statistics, Harvard University, Cambridge, MA 02138, USA; Department of Computer Science and Department of Statistics, Harvard University, Cambridge, MA 02138, USA", "aff_domain": "cs.jhu.edu;cs.ucla.edu;cs.jhu.edu;cs.ucla.edu;seas.harvard.edu", "email": "cs.jhu.edu;cs.ucla.edu;cs.jhu.edu;cs.ucla.edu;seas.harvard.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wu22p.html", "aff_unique_index": "0+1;1;0;2;2", "aff_unique_norm": "Johns Hopkins University;University of California, Los Angeles;Harvard University", "aff_unique_dep": "Department of Computer Science;Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.jhu.edu;https://www.ucla.edu;https://www.harvard.edu", "aff_unique_abbr": "JHU;UCLA;Harvard", "aff_campus_unique_index": "0+1;1;0;2;2", "aff_campus_unique": "Baltimore;Los Angeles;Cambridge", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Latent Diffusion Energy-Based Model for Interpretable Text Modelling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17721", "id": "17721", "proceeding": "https://proceedings.mlr.press/v162/yu22h.html", "poster": "/media/PosterPDFs/ICML%202022/f60bb6bb4c96d4df93c51bd69dcc15a0.png?t=1657553526.2145302", "slides": "", "author_site": "Peiyu Yu, Sirui Xie, Xiaojian Ma, Baoxiong Jia, Bo Pang, Ruiqi Gao, Yixin Zhu, Song-Chun Zhu, Ying Nian Wu", "author": "Peiyu Yu; Sirui Xie; Xiaojian Ma; Baoxiong Jia; Bo Pang; Ruiqi Gao; Yixin Zhu; Song-Chun Zhu; Ying Nian Wu", "abstract": "Latent space Energy-Based Models (EBMs), also known as energy-based priors, have drawn growing interests in generative modeling. Fueled by its flexibility in the formulation and strong modeling power of the latent space, recent works built upon it have made interesting attempts aiming at the interpretability of text modeling. However, latent space EBMs also inherit some flaws from EBMs in data space; the degenerate MCMC sampling quality in practice can lead to poor generation quality and instability in training, especially on data with complex latent structures. Inspired by the recent efforts that leverage diffusion recovery likelihood learning as a cure for the sampling issue, we introduce a novel symbiosis between the diffusion models and latent space EBMs in a variational learning framework, coined as the latent diffusion energy-based model. We develop a geometric clustering-based regularization jointly with the information bottleneck to further improve the quality of the learned latent space. 
Experiments on several challenging tasks demonstrate the superior performance of our model on interpretable text modeling over strong counterparts.", "bibtex": "@InProceedings{pmlr-v162-yu22h,\n title = \t {Latent Diffusion Energy-Based Model for Interpretable Text Modelling},\n author = {Yu, Peiyu and Xie, Sirui and Ma, Xiaojian and Jia, Baoxiong and Pang, Bo and Gao, Ruiqi and Zhu, Yixin and Zhu, Song-Chun and Wu, Ying Nian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25702--25720},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22h/yu22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22h.html},\n abstract = \t {Latent space Energy-Based Models (EBMs), also known as energy-based priors, have drawn growing interests in generative modeling. Fueled by its flexibility in the formulation and strong modeling power of the latent space, recent works built upon it have made interesting attempts aiming at the interpretability of text modeling. However, latent space EBMs also inherit some flaws from EBMs in data space; the degenerate MCMC sampling quality in practice can lead to poor generation quality and instability in training, especially on data with complex latent structures. Inspired by the recent efforts that leverage diffusion recovery likelihood learning as a cure for the sampling issue, we introduce a novel symbiosis between the diffusion models and latent space EBMs in a variational learning framework, coined as the latent diffusion energy-based model. We develop a geometric clustering-based regularization jointly with the information bottleneck to further improve the quality of the learned latent space. 
Experiments on several challenging tasks demonstrate the superior performance of our model on interpretable text modeling over strong counterparts.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22h/yu22h.pdf", "supp": "", "pdf_size": 14636986, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16836080841508277390&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, UCLA, USA+Beijing Institute for General Artificial Intelligence, China; Department of Computer Science, UCLA, USA; Department of Computer Science, UCLA, USA+Beijing Institute for General Artificial Intelligence, China; Department of Computer Science, UCLA, USA+Beijing Institute for General Artificial Intelligence, China; Salesforce Research, USA; Google Brain, USA; Institute for Artificial Intelligence, Peking University, China+School of Artificial Intelligence, Peking University, China; Department of Computer Science, UCLA, USA+Beijing Institute for General Artificial Intelligence, China+Institute for Artificial Intelligence, Peking University, China+School of Artificial Intelligence, Peking University, China+Department of Statistics, UCLA, USA+Department of Automation, Tsinghua University, China; Department of Statistics, UCLA, USA", "aff_domain": "cs.ucla.edu; ; ; ; ; ; ;cs.ucla.edu; ", "email": "cs.ucla.edu; ; ; ; ; ; ;cs.ucla.edu; ", "github": "https://github.com/yuPeiyu98/LDEBM", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/yu22h.html", "aff_unique_index": "0+1;0;0+1;0+1;2;3;4+4;0+1+4+4+0+5;0", "aff_unique_norm": "University of California, Los Angeles;Beijing Institute for General Artificial Intelligence;Salesforce Research;Google;Peking University;Tsinghua University", "aff_unique_dep": "Department of Computer Science;;;Google Brain;Institute for Artificial Intelligence;Department of Automation", "aff_unique_url": "https://www.ucla.edu;http://www.bigaiai.cn;https://research.salesforce.com;https://brain.google.com;http://www.pku.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "UCLA;BIGAI;Salesforce;Google Brain;PKU;Tsinghua", "aff_campus_unique_index": "0;0;0;0;2;;0+0;0", "aff_campus_unique": "Los Angeles;;Mountain View", "aff_country_unique_index": "0+1;0;0+1;0+1;0;0;1+1;0+1+1+1+0+1;0", "aff_country_unique": "United States;China" }, { "title": "Latent Outlier Exposure for Anomaly Detection with Contaminated Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17901", "id": "17901", "proceeding": "https://proceedings.mlr.press/v162/qiu22b.html", "poster": "/media/PosterPDFs/ICML%202022/84a955d5ff75f508ec01007bc2b9b301.png?t=1657219470.5320225", "slides": "", "author_site": "Chen Qiu, Aodong Li, Marius Kloft, Maja Rudolph, Stephan Mandt", "author": "Chen Qiu; Aodong Li; Marius Kloft; Maja Rudolph; Stephan Mandt", "abstract": "Anomaly detection aims at identifying data points that show systematic deviations from the majority of data in an unlabeled dataset. A common assumption is that clean training data (free of anomalies) is available, which is often violated in practice. We propose a strategy for training an anomaly detector in the presence of unlabeled anomalies that is compatible with a broad class of models. The idea is to jointly infer binary labels to each datum (normal vs. anomalous) while updating the model parameters. 
Inspired by outlier exposure (Hendrycks et al., 2018) that considers synthetically created, labeled anomalies, we thereby use a combination of two losses that share parameters: one for the normal and one for the anomalous data. We then iteratively proceed with block coordinate updates on the parameters and the most likely (latent) labels. Our experiments with several backbone models on three image datasets, 30 tabular data sets, and a video anomaly detection benchmark showed consistent and significant improvements over the baselines.", "bibtex": "@InProceedings{pmlr-v162-qiu22b,\n title = \t {Latent Outlier Exposure for Anomaly Detection with Contaminated Data},\n author = {Qiu, Chen and Li, Aodong and Kloft, Marius and Rudolph, Maja and Mandt, Stephan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18153--18167},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiu22b/qiu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiu22b.html},\n abstract = \t {Anomaly detection aims at identifying data points that show systematic deviations from the majority of data in an unlabeled dataset. A common assumption is that clean training data (free of anomalies) is available, which is often violated in practice. We propose a strategy for training an anomaly detector in the presence of unlabeled anomalies that is compatible with a broad class of models. The idea is to jointly infer binary labels to each datum (normal vs. anomalous) while updating the model parameters. Inspired by outlier exposure (Hendrycks et al., 2018) that considers synthetically created, labeled anomalies, we thereby use a combination of two losses that share parameters: one for the normal and one for the anomalous data. We then iteratively proceed with block coordinate updates on the parameters and the most likely (latent) labels. 
Our experiments with several backbone models on three image datasets, 30 tabular data sets, and a video anomaly detection benchmark showed consistent and significant improvements over the baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/qiu22b/qiu22b.pdf", "supp": "", "pdf_size": 4233533, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3679566789459312121&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Bosch Center for Artificial Intelligence+TU Kaiserslautern; TU Kaiserslautern; TU Kaiserslautern; Bosch Center for Artificial Intelligence; UC Irvine", "aff_domain": "de.bosch.com; ; ; ;uci.edu", "email": "de.bosch.com; ; ; ;uci.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qiu22b.html", "aff_unique_index": "0+1;1;1;0;2", "aff_unique_norm": "Bosch Center for Artificial Intelligence;Technische Universit\u00e4t Kaiserslautern;University of California, Irvine", "aff_unique_dep": "Center for Artificial Intelligence;;", "aff_unique_url": "https://www.bosch-ai.com;https://www.tu-kl.de;https://www.uci.edu", "aff_unique_abbr": "BCAI;TU Kaiserslautern;UCI", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0+0;0;0;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Lazy Estimation of Variable Importance for Large Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18087", "id": "18087", "proceeding": "https://proceedings.mlr.press/v162/gao22h.html", "poster": "", "slides": "", "author_site": "Yue Gao, Abby Stevens, Garvesh Raskutti, Rebecca Willett", "author": "Yue Gao; Abby Stevens; Garvesh Raskutti; Rebecca Willett", "abstract": "As opaque predictive models increasingly impact many areas of modern life, interest in quantifying the importance of a given input variable for making a specific prediction has grown. Recently, there has been a proliferation of model-agnostic methods to measure variable importance (VI) that analyze the difference in predictive power between a full model trained on all variables and a reduced model that excludes the variable(s) of interest. A bottleneck common to these methods is the estimation of the reduced model for each variable (or subset of variables), which is an expensive process that often does not come with theoretical guarantees. In this work, we propose a fast and flexible method for approximating the reduced model with important inferential guarantees. We replace the need for fully retraining a wide neural network by a linearization initialized at the full model parameters. By adding a ridge-like penalty to make the problem convex, we prove that when the ridge penalty parameter is sufficiently large, our method estimates the variable importance measure with an error rate of O(1/n) where n is the number of training samples. We also show that our estimator is asymptotically normal, enabling us to provide confidence bounds for the VI estimates. 
We demonstrate through simulations that our method is fast and accurate under several data-generating regimes, and we demonstrate its real-world applicability on a seasonal climate forecasting example.", "bibtex": "@InProceedings{pmlr-v162-gao22h,\n title = \t {Lazy Estimation of Variable Importance for Large Neural Networks},\n author = {Gao, Yue and Stevens, Abby and Raskutti, Garvesh and Willett, Rebecca},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7122--7143},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22h/gao22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22h.html},\n abstract = \t {As opaque predictive models increasingly impact many areas of modern life, interest in quantifying the importance of a given input variable for making a specific prediction has grown. Recently, there has been a proliferation of model-agnostic methods to measure variable importance (VI) that analyze the difference in predictive power between a full model trained on all variables and a reduced model that excludes the variable(s) of interest. A bottleneck common to these methods is the estimation of the reduced model for each variable (or subset of variables), which is an expensive process that often does not come with theoretical guarantees. In this work, we propose a fast and flexible method for approximating the reduced model with important inferential guarantees. We replace the need for fully retraining a wide neural network by a linearization initialized at the full model parameters. By adding a ridge-like penalty to make the problem convex, we prove that when the ridge penalty parameter is sufficiently large, our method estimates the variable importance measure with an error rate of O(1/n) where n is the number of training samples. We also show that our estimator is asymptotically normal, enabling us to provide confidence bounds for the VI estimates. 
We demonstrate through simulations that our method is fast and accurate under several data-generating regimes, and we demonstrate its real-world applicability on a seasonal climate forecasting example.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22h/gao22h.pdf", "supp": "", "pdf_size": 3067960, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11646154414177168250&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Statistics, University of Wisconsin, Madison; Department of Statistics, University of Chicago; Department of Statistics, University of Wisconsin, Madison + Department of Computer Science, University of Chicago; Department of Statistics, University of Chicago + Department of Computer Science, University of Chicago", "aff_domain": "wisc.edu;uchicago.edu;wisc.edu;uchicago.edu", "email": "wisc.edu;uchicago.edu;wisc.edu;uchicago.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gao22h.html", "aff_unique_index": "0;1;0+1;1+1", "aff_unique_norm": "University of Wisconsin-Madison;University of Chicago", "aff_unique_dep": "Department of Statistics;Department of Statistics", "aff_unique_url": "https://www.wisc.edu;https://www.uchicago.edu", "aff_unique_abbr": "UW-Madison;UChicago", "aff_campus_unique_index": "0;0;", "aff_campus_unique": "Madison;", "aff_country_unique_index": "0;0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "LeNSE: Learning To Navigate Subgraph Embeddings for Large-Scale Combinatorial Optimisation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18005", "id": "18005", "proceeding": "https://proceedings.mlr.press/v162/ireland22a.html", "poster": "/media/PosterPDFs/ICML%202022/177db6acfe388526a4c7bff88e1feb15.png?t=1657530539.693988", "slides": "", "author_site": "David Ireland, Giovanni Montana", "author": "David Ireland; Giovanni Montana", "abstract": "Combinatorial Optimisation problems arise in several application domains and are often formulated in terms of graphs. Many of these problems are NP-hard, but exact solutions are not always needed. Several heuristics have been developed to provide near-optimal solutions; however, they do not typically scale well with the size of the graph. We propose a low-complexity approach for identifying a (possibly much smaller) subgraph of the original graph where the heuristics can be run in reasonable time and with a high likelihood of finding a global near-optimal solution. The core component of our approach is LeNSE, a reinforcement learning algorithm that learns how to navigate the space of possible subgraphs using an Euclidean subgraph embedding as its map. To solve CO problems, LeNSE is provided with a discriminative embedding trained using any existing heuristics using only on a small portion of the original graph. When tested on three problems (vertex cover, max-cut and influence maximisation) using real graphs with up to $10$ million edges, LeNSE identifies small subgraphs yielding solutions comparable to those found by running the heuristics on the entire graph, but at a fraction of the total run time. 
Code for the experiments is available in the public GitHub repo at https://github.com/davidireland3/LeNSE.", "bibtex": "@InProceedings{pmlr-v162-ireland22a,\n title = \t {{L}e{NSE}: Learning To Navigate Subgraph Embeddings for Large-Scale Combinatorial Optimisation},\n author = {Ireland, David and Montana, Giovanni},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9622--9638},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ireland22a/ireland22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ireland22a.html},\n abstract = \t {Combinatorial Optimisation problems arise in several application domains and are often formulated in terms of graphs. Many of these problems are NP-hard, but exact solutions are not always needed. Several heuristics have been developed to provide near-optimal solutions; however, they do not typically scale well with the size of the graph. We propose a low-complexity approach for identifying a (possibly much smaller) subgraph of the original graph where the heuristics can be run in reasonable time and with a high likelihood of finding a global near-optimal solution. The core component of our approach is LeNSE, a reinforcement learning algorithm that learns how to navigate the space of possible subgraphs using an Euclidean subgraph embedding as its map. To solve CO problems, LeNSE is provided with a discriminative embedding trained using any existing heuristics using only on a small portion of the original graph. When tested on three problems (vertex cover, max-cut and influence maximisation) using real graphs with up to $10$ million edges, LeNSE identifies small subgraphs yielding solutions comparable to those found by running the heuristics on the entire graph, but at a fraction of the total run time. 
Code for the experiments is available in the public GitHub repo at https://github.com/davidireland3/LeNSE.}\n}", "pdf": "https://proceedings.mlr.press/v162/ireland22a/ireland22a.pdf", "supp": "", "pdf_size": 873342, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7267816984726307573&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Warwick Manufacturing Group, University of Warwick, Coventry, United Kingdom+Department of Statistics, University of Warwick, Coventry, United Kingdom; Department of Statistics, University of Warwick, Coventry, United Kingdom", "aff_domain": "warwick.ac.uk;warwick.ac.uk", "email": "warwick.ac.uk;warwick.ac.uk", "github": "https://github.com/davidireland3/LeNSE", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ireland22a.html", "aff_unique_index": "0+0;0", "aff_unique_norm": "University of Warwick", "aff_unique_dep": "Warwick Manufacturing Group", "aff_unique_url": "https://www.warwick.ac.uk", "aff_unique_abbr": "Warwick", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Coventry", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning Augmented Binary Search Trees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16743", "id": "16743", "proceeding": "https://proceedings.mlr.press/v162/lin22f.html", "poster": "", "slides": "", "author_site": "Honghao Lin, Tian Luo, David Woodruff", "author": "Honghao Lin; Tian Luo; David Woodruff", "abstract": "A treap is a classic randomized binary search tree data structure that is easy to implement and supports O(log n) expected time access. However, classic treaps do not take advantage of the input distribution or patterns in the input. Given recent advances in algorithms with predictions, we propose pairing treaps with machine advice to form a learning-augmented treap. We are the first to propose a learning-augmented data structure that supports binary search tree operations such as range-query and successor functionalities. With the assumption that we have access to advice from a frequency estimation oracle, we assign learned priorities to the nodes to better improve the treap\u2019s structure. We theoretically analyze the learning-augmented treap\u2019s performance under various input distributions and show that under those circumstances, our learning-augmented treap has stronger guarantees than classic treaps and other classic tree-based data structures. Further, we experimentally evaluate our learned treap on synthetic datasets and demonstrate a performance advantage over other search tree data structures. 
We also present experiments on real world datasets with known frequency estimation oracles and show improvements as well.", "bibtex": "@InProceedings{pmlr-v162-lin22f,\n title = \t {Learning Augmented Binary Search Trees},\n author = {Lin, Honghao and Luo, Tian and Woodruff, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13431--13440},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22f/lin22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22f.html},\n abstract = \t {A treap is a classic randomized binary search tree data structure that is easy to implement and supports O(log n) expected time access. However, classic treaps do not take advantage of the input distribution or patterns in the input. Given recent advances in algorithms with predictions, we propose pairing treaps with machine advice to form a learning-augmented treap. We are the first to propose a learning-augmented data structure that supports binary search tree operations such as range-query and successor functionalities. With the assumption that we have access to advice from a frequency estimation oracle, we assign learned priorities to the nodes to better improve the treap\u2019s structure. We theoretically analyze the learning-augmented treap\u2019s performance under various input distributions and show that under those circumstances, our learning-augmented treap has stronger guarantees than classic treaps and other classic tree-based data structures. Further, we experimentally evaluate our learned treap on synthetic datasets and demonstrate a performance advantage over other search tree data structures. 
We also present experiments on real world datasets with known frequency estimation oracles and show improvements as well.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22f/lin22f.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lin22f-supp.zip", "pdf_size": 509564, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17377530676648083749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Computer Science Department, Carnegie Mellon University, Pittsburgh, PA, USA; Computer Science Department, Carnegie Mellon University, Pittsburgh, PA, USA; Computer Science Department, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu", "email": "andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lin22f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Bellman Complete Representations for Offline Policy Evaluation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16801", "id": "16801", "proceeding": "https://proceedings.mlr.press/v162/chang22b.html", "poster": "/media/PosterPDFs/ICML%202022/bc3ea21614e0fdc4359bdd4d3315313f.png?t=1657808898.1739562", "slides": "", "author_site": "Jonathan Chang, Kaiwen Wang, Nathan Kallus, Wen Sun", "author": "Jonathan Chang; Kaiwen Wang; Nathan Kallus; Wen Sun", "abstract": "We study representation learning for Offline Reinforcement Learning (RL), focusing on the important task of Offline Policy Evaluation (OPE). Recent work shows that, in contrast to supervised learning, realizability of the Q-function is not enough for learning it. Two sufficient conditions for sample-efficient OPE are Bellman completeness and coverage. Prior work often assumes that representations satisfying these conditions are given, with results being mostly theoretical in nature. In this work, we propose BCRL, which directly learns from data an approximately linear Bellman complete representation with good coverage. With this learned representation, we perform OPE using Least Square Policy Evaluation (LSPE) with linear functions in our learned representation. We present an end-to-end theoretical analysis, showing that our two-stage algorithm enjoys polynomial sample complexity provided some representation in the rich class considered is linear Bellman complete. Empirically, we extensively evaluate our algorithm on challenging, image-based continuous control tasks from the Deepmind Control Suite. We show our representation enables better OPE compared to previous representation learning methods developed for off-policy RL (e.g., CURL, SPR). BCRL achieve competitive OPE error with the state-of-the-art method Fitted Q-Evaluation (FQE), and beats FQE when evaluating beyond the initial state distribution. 
Our ablations show that both linear Bellman complete and coverage components of our method are crucial.", "bibtex": "@InProceedings{pmlr-v162-chang22b,\n title = \t {Learning {B}ellman Complete Representations for Offline Policy Evaluation},\n author = {Chang, Jonathan and Wang, Kaiwen and Kallus, Nathan and Sun, Wen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2938--2971},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chang22b/chang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/chang22b.html},\n abstract = \t {We study representation learning for Offline Reinforcement Learning (RL), focusing on the important task of Offline Policy Evaluation (OPE). Recent work shows that, in contrast to supervised learning, realizability of the Q-function is not enough for learning it. Two sufficient conditions for sample-efficient OPE are Bellman completeness and coverage. Prior work often assumes that representations satisfying these conditions are given, with results being mostly theoretical in nature. In this work, we propose BCRL, which directly learns from data an approximately linear Bellman complete representation with good coverage. With this learned representation, we perform OPE using Least Square Policy Evaluation (LSPE) with linear functions in our learned representation. We present an end-to-end theoretical analysis, showing that our two-stage algorithm enjoys polynomial sample complexity provided some representation in the rich class considered is linear Bellman complete. Empirically, we extensively evaluate our algorithm on challenging, image-based continuous control tasks from the Deepmind Control Suite. We show our representation enables better OPE compared to previous representation learning methods developed for off-policy RL (e.g., CURL, SPR). BCRL achieve competitive OPE error with the state-of-the-art method Fitted Q-Evaluation (FQE), and beats FQE when evaluating beyond the initial state distribution. 
Our ablations show that both linear Bellman complete and coverage components of our method are crucial.}\n}", "pdf": "https://proceedings.mlr.press/v162/chang22b/chang22b.pdf", "supp": "", "pdf_size": 1156927, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6803502920630786381&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Computer Science, Cornell University; Computer Science, Cornell University; Operations Research and Information Engineering, Cornell Tech; Computer Science, Cornell University", "aff_domain": "cornell.edu;cornell.edu;cornell.edu;cornell.edu", "email": "cornell.edu;cornell.edu;cornell.edu;cornell.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chang22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "Computer Science", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cornell Tech", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Domain Adaptive Object Detection with Probabilistic Teacher", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16029", "id": "16029", "proceeding": "https://proceedings.mlr.press/v162/chen22b.html", "poster": "/media/PosterPDFs/ICML%202022/3fab5890d8113d0b5a4178201dc842ad_JbnUa9W.png?t=1658079602.23556", "slides": "/media/icml-2022/Slides/16029_GRhUD1C.pdf", "author_site": "Meilin Chen, Weijie Chen, Shicai Yang, Jie Song, Xinchao Wang, Lei Zhang, Yunfeng Yan, Donglian Qi, Yueting Zhuang, Di Xie, Shiliang Pu", "author": "Meilin Chen; Weijie Chen; Shicai Yang; Jie Song; Xinchao Wang; Lei Zhang; Yunfeng Yan; Donglian Qi; Yueting Zhuang; Di Xie; Shiliang Pu", "abstract": "Self-training for unsupervised domain adaptive object detection is a challenging task, of which the performance depends heavily on the quality of pseudo boxes. Despite the promising results, prior works have largely overlooked the uncertainty of pseudo boxes during self-training. In this paper, we present a simple yet effective framework, termed as Probabilistic Teacher (PT), which aims to capture the uncertainty of unlabeled target data from a gradually evolving teacher and guides the learning of a student in a mutually beneficial manner. Specifically, we propose to leverage the uncertainty-guided consistency training to promote classification adaptation and localization adaptation, rather than filtering pseudo boxes via an elaborate confidence threshold. In addition, we conduct anchor adaptation in parallel with localization adaptation, since anchor can be regarded as a learnable parameter. Together with this framework, we also present a novel Entropy Focal Loss (EFL) to further facilitate the uncertainty-guided self-training. 
Equipped with EFL, PT outperforms all previous baselines by a large margin and achieve new state-of-the-arts.", "bibtex": "@InProceedings{pmlr-v162-chen22b,\n title = \t {Learning Domain Adaptive Object Detection with Probabilistic Teacher},\n author = {Chen, Meilin and Chen, Weijie and Yang, Shicai and Song, Jie and Wang, Xinchao and Zhang, Lei and Yan, Yunfeng and Qi, Donglian and Zhuang, Yueting and Xie, Di and Pu, Shiliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3040--3055},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22b/chen22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22b.html},\n abstract = \t {Self-training for unsupervised domain adaptive object detection is a challenging task, of which the performance depends heavily on the quality of pseudo boxes. Despite the promising results, prior works have largely overlooked the uncertainty of pseudo boxes during self-training. In this paper, we present a simple yet effective framework, termed as Probabilistic Teacher (PT), which aims to capture the uncertainty of unlabeled target data from a gradually evolving teacher and guides the learning of a student in a mutually beneficial manner. Specifically, we propose to leverage the uncertainty-guided consistency training to promote classification adaptation and localization adaptation, rather than filtering pseudo boxes via an elaborate confidence threshold. In addition, we conduct anchor adaptation in parallel with localization adaptation, since anchor can be regarded as a learnable parameter. Together with this framework, we also present a novel Entropy Focal Loss (EFL) to further facilitate the uncertainty-guided self-training. 
Equipped with EFL, PT outperforms all previous baselines by a large margin and achieve new state-of-the-arts.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22b/chen22b.pdf", "supp": "", "pdf_size": 16128341, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17755903452096200771&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Zhejiang University; Zhejiang University+Hikvision Research Institute; Hikvision Research Institute; Zhejiang University; National University of Singapore; Chongqing University; Zhejiang University; Hainan Institute of Zhejiang University; Zhejiang University; Hikvision Research Institute; Hikvision Research Institute", "aff_domain": "zju.edu.cn;hikvision.com; ; ; ; ; ; ; ; ;", "email": "zju.edu.cn;hikvision.com; ; ; ; ; ; ; ; ;", "github": "https://github.com/hikvision-research/ProbabilisticTeacher", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/chen22b.html", "aff_unique_index": "0;0+1;1;0;2;3;0;0;0;1;1", "aff_unique_norm": "Zhejiang University;Hikvision Research Institute;National University of Singapore;Chongqing University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.zju.edu.cn;https://www.hikvision.com/cn/;https://www.nus.edu.sg;https://www.cqu.edu.cn", "aff_unique_abbr": "ZJU;Hikvision;NUS;CQU", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Hainan", "aff_country_unique_index": "0;0+0;0;0;1;0;0;0;0;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Learning Dynamics and Generalization in Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17489", "id": "17489", "proceeding": "https://proceedings.mlr.press/v162/lyle22a.html", "poster": "/media/PosterPDFs/ICML%202022/ac8cd1808dc30460a81cab1b0e6652fa.png?t=1657636610.2176676", "slides": "", "author_site": "Clare Lyle, Mark Rowland, Will Dabney, Marta Kwiatkowska, Yarin Gal", "author": "Clare Lyle; Mark Rowland; Will Dabney; Marta Kwiatkowska; Yarin Gal", "abstract": "Solving a reinforcement learning (RL) problem poses two competing challenges: fitting a potentially discontinuous value function, and generalizing well to new observations. In this paper, we analyze the learning dynamics of temporal difference algorithms to gain novel insight into the tension between these two objectives. We show theoretically that temporal difference learning encourages agents to fit non-smooth components of the value function early in training, and at the same time induces the second-order effect of discouraging generalization. We corroborate these findings in deep RL agents trained on a range of environments, finding that neural networks trained using temporal difference algorithms on dense reward tasks exhibit weaker generalization between states than randomly initialized networks and networks trained with policy gradient methods. 
Finally, we investigate how post-training policy distillation may avoid this pitfall, and show that this approach improves generalization to novel environments in the ProcGen suite and improves robustness to input perturbations.", "bibtex": "@InProceedings{pmlr-v162-lyle22a,\n title = \t {Learning Dynamics and Generalization in Deep Reinforcement Learning},\n author = {Lyle, Clare and Rowland, Mark and Dabney, Will and Kwiatkowska, Marta and Gal, Yarin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14560--14581},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lyle22a/lyle22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lyle22a.html},\n abstract = \t {Solving a reinforcement learning (RL) problem poses two competing challenges: fitting a potentially discontinuous value function, and generalizing well to new observations. In this paper, we analyze the learning dynamics of temporal difference algorithms to gain novel insight into the tension between these two objectives. We show theoretically that temporal difference learning encourages agents to fit non-smooth components of the value function early in training, and at the same time induces the second-order effect of discouraging generalization. We corroborate these findings in deep RL agents trained on a range of environments, finding that neural networks trained using temporal difference algorithms on dense reward tasks exhibit weaker generalization between states than randomly initialized networks and networks trained with policy gradient methods. Finally, we investigate how post-training policy distillation may avoid this pitfall, and show that this approach improves generalization to novel environments in the ProcGen suite and improves robustness to input perturbations.}\n}", "pdf": "https://proceedings.mlr.press/v162/lyle22a/lyle22a.pdf", "supp": "", "pdf_size": 3367947, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4573933009459082015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Department of Computer Science, University of Oxford; DeepMind; DeepMind; Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford", "aff_domain": "cs.ox.ac.uk; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lyle22a.html", "aff_unique_index": "0;1;1;0;0", "aff_unique_norm": "University of Oxford;DeepMind", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ox.ac.uk;https://deepmind.com", "aff_unique_abbr": "Oxford;DeepMind", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning Efficient and Robust Ordinary Differential Equations via Invertible Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17277", "id": "17277", "proceeding": "https://proceedings.mlr.press/v162/zhi22a.html", "poster": "", "slides": "", "author_site": "Weiming Zhi, Tin Lai, Lionel Ott, Edwin V Bonilla, Fabio Ramos", "author": "Weiming Zhi; Tin Lai; Lionel Ott; Edwin V. 
Bonilla; Fabio Ramos", "abstract": "Advances in differentiable numerical integrators have enabled the use of gradient descent techniques to learn ordinary differential equations (ODEs), where a flexible function approximator (often a neural network) is used to estimate the system dynamics, given as a time derivative. However, these integrators can be unsatisfactorily slow and unstable when learning systems of ODEs from long sequences. We propose to learn an ODE of interest from data by viewing its dynamics as a vector field related to another base vector field via a diffeomorphism (i.e., a differentiable bijection), represented by an invertible neural network (INN). By learning both the INN and the dynamics of the base ODE, we provide an avenue to offload some of the complexity in modelling the dynamics directly on to the INN. Consequently, by restricting the base ODE to be amenable to integration, we can speed up and improve the robustness of integrating trajectories from the learned system. We demonstrate the efficacy of our method in training and evaluating benchmark ODE systems, as well as within continuous-depth neural networks models. We show that our approach attains speed-ups of up to two orders of magnitude when integrating learned ODEs.", "bibtex": "@InProceedings{pmlr-v162-zhi22a,\n title = \t {Learning Efficient and Robust Ordinary Differential Equations via Invertible Neural Networks},\n author = {Zhi, Weiming and Lai, Tin and Ott, Lionel and Bonilla, Edwin V. and Ramos, Fabio},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27060--27074},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhi22a/zhi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhi22a.html},\n abstract = \t {Advances in differentiable numerical integrators have enabled the use of gradient descent techniques to learn ordinary differential equations (ODEs), where a flexible function approximator (often a neural network) is used to estimate the system dynamics, given as a time derivative. However, these integrators can be unsatisfactorily slow and unstable when learning systems of ODEs from long sequences. We propose to learn an ODE of interest from data by viewing its dynamics as a vector field related to another base vector field via a diffeomorphism (i.e., a differentiable bijection), represented by an invertible neural network (INN). By learning both the INN and the dynamics of the base ODE, we provide an avenue to offload some of the complexity in modelling the dynamics directly on to the INN. Consequently, by restricting the base ODE to be amenable to integration, we can speed up and improve the robustness of integrating trajectories from the learned system. We demonstrate the efficacy of our method in training and evaluating benchmark ODE systems, as well as within continuous-depth neural networks models. 
We show that our approach attains speed-ups of up to two orders of magnitude when integrating learned ODEs.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhi22a/zhi22a.pdf", "supp": "", "pdf_size": 2913308, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18007372734501837324&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "School of Computer Science, the University of Sydney, Australia; School of Computer Science, the University of Sydney, Australia; Autonomous Systems Lab, ETH Zurich, Switzerland; CSIRO\u2019s Data61, Australia; School of Computer Science, the University of Sydney, Australia + NVIDIA, USA", "aff_domain": "sydney.edu.au; ; ; ; ", "email": "sydney.edu.au; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhi22a.html", "aff_unique_index": "0;0;1;2;0+3", "aff_unique_norm": "University of Sydney;ETH Zurich;CSIRO;NVIDIA", "aff_unique_dep": "School of Computer Science;Autonomous Systems Lab;Data61;NVIDIA", "aff_unique_url": "https://www.sydney.edu.au;https://www.ethz.ch;https://www.csiro.au;https://www.nvidia.com", "aff_unique_abbr": "USYD;ETHZ;CSIRO;NV", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0;0;1;0;0+2", "aff_country_unique": "Australia;Switzerland;United States" }, { "title": "Learning General Halfspaces with Adversarial Label Noise via Online Gradient Descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17943", "id": "17943", "proceeding": "https://proceedings.mlr.press/v162/diakonikolas22b.html", "poster": "", "slides": "", "author_site": "Ilias Diakonikolas, Vasilis Kontonis, Christos Tzamos, Nikos Zarifis", "author": "Ilias Diakonikolas; Vasilis Kontonis; Christos Tzamos; Nikos Zarifis", "abstract": "We study the problem of learning general {\u2014} i.e., not necessarily homogeneous {\u2014} halfspaces with adversarial label noise under the Gaussian distribution. Prior work has provided a sophisticated polynomial-time algorithm for this problem. In this work, we show that the problem can be solved directly via online gradient descent applied to a sequence of natural non-convex surrogates. This approach yields a simple iterative learning algorithm for general halfspaces with near-optimal sample complexity, runtime, and error guarantee. At the conceptual level, our work establishes an intriguing connection between learning halfspaces with adversarial noise and online optimization that may find other applications.", "bibtex": "@InProceedings{pmlr-v162-diakonikolas22b,\n title = \t {Learning General Halfspaces with Adversarial Label Noise via Online Gradient Descent},\n author = {Diakonikolas, Ilias and Kontonis, Vasilis and Tzamos, Christos and Zarifis, Nikos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5118--5141},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/diakonikolas22b/diakonikolas22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/diakonikolas22b.html},\n abstract = \t {We study the problem of learning general {\u2014} i.e., not necessarily homogeneous {\u2014} halfspaces with adversarial label noise under the Gaussian distribution. 
Prior work has provided a sophisticated polynomial-time algorithm for this problem. In this work, we show that the problem can be solved directly via online gradient descent applied to a sequence of natural non-convex surrogates. This approach yields a simple iterative learning algorithm for general halfspaces with near-optimal sample complexity, runtime, and error guarantee. At the conceptual level, our work establishes an intriguing connection between learning halfspaces with adversarial noise and online optimization that may find other applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/diakonikolas22b/diakonikolas22b.pdf", "supp": "", "pdf_size": 570370, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10236601334482000824&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Sciences, University of Wisconsin, Madison, Wisconsin, USA; Department of Computer Sciences, University of Wisconsin, Madison, Wisconsin, USA; Department of Computer Sciences, University of Wisconsin, Madison, Wisconsin, USA; Department of Computer Sciences, University of Wisconsin, Madison, Wisconsin, USA", "aff_domain": "wisc.edu;wisc.edu; ; ", "email": "wisc.edu;wisc.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/diakonikolas22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Infinite-horizon Average-reward Markov Decision Process with Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17667", "id": "17667", "proceeding": "https://proceedings.mlr.press/v162/chen22i.html", "poster": "/media/PosterPDFs/ICML%202022/d756d3d2b9dac72449a6a6926534558a.png?t=1657184523.0448089", "slides": "", "author_site": "Liyu Chen, Rahul Jain, Haipeng Luo", "author": "Liyu Chen; Rahul Jain; Haipeng Luo", "abstract": "We study regret minimization for infinite-horizon average-reward Markov Decision Processes (MDPs) under cost constraints. We start by designing a policy optimization algorithm with carefully designed action-value estimator and bonus term, and show that for ergodic MDPs, our algorithm ensures $O(\\sqrt{T})$ regret and constant constraint violation, where $T$ is the total number of time steps. This strictly improves over the algorithm of (Singh et al., 2020), whose regret and constraint violation are both $O(T^{2/3})$. Next, we consider the most general class of weakly communicating MDPs. Through a finite-horizon approximation, we develop another algorithm with $O(T^{2/3})$ regret and constraint violation, which can be further improved to $O(\\sqrt{T})$ via a simple modification, albeit making the algorithm computationally inefficient. 
As far as we know, these are the first set of provable algorithms for weakly communicating MDPs with cost constraints.", "bibtex": "@InProceedings{pmlr-v162-chen22i,\n title = \t {Learning Infinite-horizon Average-reward {M}arkov Decision Process with Constraints},\n author = {Chen, Liyu and Jain, Rahul and Luo, Haipeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3246--3270},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22i/chen22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22i.html},\n abstract = \t {We study regret minimization for infinite-horizon average-reward Markov Decision Processes (MDPs) under cost constraints. We start by designing a policy optimization algorithm with carefully designed action-value estimator and bonus term, and show that for ergodic MDPs, our algorithm ensures $O(\\sqrt{T})$ regret and constant constraint violation, where $T$ is the total number of time steps. This strictly improves over the algorithm of (Singh et al., 2020), whose regret and constraint violation are both $O(T^{2/3})$. Next, we consider the most general class of weakly communicating MDPs. Through a finite-horizon approximation, we develop another algorithm with $O(T^{2/3})$ regret and constraint violation, which can be further improved to $O(\\sqrt{T})$ via a simple modification, albeit making the algorithm computationally inefficient. As far as we know, these are the first set of provable algorithms for weakly communicating MDPs with cost constraints.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22i/chen22i.pdf", "supp": "", "pdf_size": 611734, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=778400730218345411&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of Southern California; University of Southern California; University of Southern California", "aff_domain": "usc.edu; ; ", "email": "usc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22i.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Iterative Reasoning through Energy Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17507", "id": "17507", "proceeding": "https://proceedings.mlr.press/v162/du22d.html", "poster": "", "slides": "", "author_site": "Yilun Du, Shuang Li, Josh Tenenbaum, Igor Mordatch", "author": "Yilun Du; Shuang Li; Joshua Tenenbaum; Igor Mordatch", "abstract": "Deep learning has excelled on complex pattern recognition tasks such as image classification and object recognition. However, it struggles with tasks requiring nontrivial reasoning, such as algorithmic computation. Humans are able to solve such tasks through iterative reasoning \u2013 spending more time to think about harder tasks. 
Most existing neural networks, however, exhibit a fixed computational budget controlled by the neural network architecture, preventing additional computational processing on harder tasks. In this work, we present a new framework for iterative reasoning with neural networks. We train a neural network to parameterize an energy landscape over all outputs, and implement each step of the iterative reasoning as an energy minimization step to find a minimal energy solution. By formulating reasoning as an energy minimization problem, for harder problems that lead to more complex energy landscapes, we may then adjust our underlying computational budget by running a more complex optimization procedure. We empirically illustrate that our iterative reasoning approach can solve more accurate and generalizable algorithmic reasoning tasks in both graph and continuous domains. Finally, we illustrate that our approach can recursively solve algorithmic problems requiring nested reasoning.", "bibtex": "@InProceedings{pmlr-v162-du22d,\n title = \t {Learning Iterative Reasoning through Energy Minimization},\n author = {Du, Yilun and Li, Shuang and Tenenbaum, Joshua and Mordatch, Igor},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5570--5582},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/du22d/du22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/du22d.html},\n abstract = \t {Deep learning has excelled on complex pattern recognition tasks such as image classification and object recognition. However, it struggles with tasks requiring nontrivial reasoning, such as algorithmic computation. Humans are able to solve such tasks through iterative reasoning \u2013 spending more time to think about harder tasks. Most existing neural networks, however, exhibit a fixed computational budget controlled by the neural network architecture, preventing additional computational processing on harder tasks. In this work, we present a new framework for iterative reasoning with neural networks. We train a neural network to parameterize an energy landscape over all outputs, and implement each step of the iterative reasoning as an energy minimization step to find a minimal energy solution. By formulating reasoning as an energy minimization problem, for harder problems that lead to more complex energy landscapes, we may then adjust our underlying computational budget by running a more complex optimization procedure. We empirically illustrate that our iterative reasoning approach can solve more accurate and generalizable algorithmic reasoning tasks in both graph and continuous domains. 
Finally, we illustrate that our approach can recursively solve algorithmic problems requiring nested reasoning.}\n}", "pdf": "https://proceedings.mlr.press/v162/du22d/du22d.pdf", "supp": "", "pdf_size": 2421799, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1554477033097529382&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "MIT CSAIL; MIT CSAIL; MIT CSAIL; Google Brain", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "https://energy-based-model.github.io/iterative-reasoning-as-energy-minimization/", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/du22d.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Google Brain", "aff_unique_url": "https://www.csail.mit.edu;https://brain.google.com", "aff_unique_abbr": "MIT CSAIL;Google Brain", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Cambridge;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Markov Games with Adversarial Opponents: Efficient Algorithms and Fundamental Limits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17103", "id": "17103", "proceeding": "https://proceedings.mlr.press/v162/liu22r.html", "poster": "/media/PosterPDFs/ICML%202022/7a1d9028a78f418cb8f01909a348d9b2.png?t=1657896944.9490862", "slides": "", "author_site": "Qinghua Liu, Yuanhao Wang, Chi Jin", "author": "Qinghua Liu; Yuanhao Wang; Chi Jin", "abstract": "An ideal strategy in zero-sum games should not only grant the player an average reward no less than the value of Nash equilibrium, but also exploit the (adaptive) opponents when they are suboptimal. While most existing works in Markov games focus exclusively on the former objective, it remains open whether we can achieve both objectives simultaneously. To address this problem, this work studies no-regret learning in Markov games with adversarial opponents when competing against the best fixed policy in hindsight. Along this direction, we present a new complete set of positive and negative results: When the policies of the opponents are revealed at the end of each episode, we propose new efficient algorithms achieving $\\sqrt{K}$ regret bounds when either (1) the baseline policy class is small or (2) the opponent\u2019s policy class is small. This is complemented with an exponential lower bound when neither conditions are true. When the policies of the opponents are not revealed, we prove a statistical hardness result even in the most favorable scenario when both above conditions are true. 
Our hardness result is much stronger than the existing hardness results which either only involve computational hardness, or require further restrictions on the algorithms.", "bibtex": "@InProceedings{pmlr-v162-liu22r,\n title = \t {Learning {M}arkov Games with Adversarial Opponents: Efficient Algorithms and Fundamental Limits},\n author = {Liu, Qinghua and Wang, Yuanhao and Jin, Chi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14036--14053},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22r/liu22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22r.html},\n abstract = \t {An ideal strategy in zero-sum games should not only grant the player an average reward no less than the value of Nash equilibrium, but also exploit the (adaptive) opponents when they are suboptimal. While most existing works in Markov games focus exclusively on the former objective, it remains open whether we can achieve both objectives simultaneously. To address this problem, this work studies no-regret learning in Markov games with adversarial opponents when competing against the best fixed policy in hindsight. Along this direction, we present a new complete set of positive and negative results: When the policies of the opponents are revealed at the end of each episode, we propose new efficient algorithms achieving $\\sqrt{K}$ regret bounds when either (1) the baseline policy class is small or (2) the opponent\u2019s policy class is small. This is complemented with an exponential lower bound when neither conditions are true. When the policies of the opponents are not revealed, we prove a statistical hardness result even in the most favorable scenario when both above conditions are true. Our hardness result is much stronger than the existing hardness results which either only involve computational hardness, or require further restrictions on the algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22r/liu22r.pdf", "supp": "", "pdf_size": 628337, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10404859710633921789&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Princeton University; Princeton University; Princeton University", "aff_domain": "princeton.edu; ; ", "email": "princeton.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/liu22r.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Mixtures of Linear Dynamical Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15993", "id": "15993", "proceeding": "https://proceedings.mlr.press/v162/chen22t.html", "poster": "/media/PosterPDFs/ICML%202022/ad71c82b22f4f65b9398f76d8be4c615.png?t=1657473380.5777075", "slides": "", "author_site": "Yanxi Chen, H. Vincent Poor", "author": "Yanxi Chen; H. 
Vincent Poor", "abstract": "We study the problem of learning a mixture of multiple linear dynamical systems (LDSs) from unlabeled short sample trajectories, each generated by one of the LDS models. Despite the wide applicability of mixture models for time-series data, learning algorithms that come with end-to-end performance guarantees are largely absent from existing literature. There are multiple sources of technical challenges, including but not limited to (1) the presence of latent variables (i.e. the unknown labels of trajectories); (2) the possibility that the sample trajectories might have lengths much smaller than the dimension $d$ of the LDS models; and (3) the complicated temporal dependence inherent to time-series data. To tackle these challenges, we develop a two-stage meta-algorithm, which is guaranteed to efficiently recover each ground-truth LDS model up to error $\\tilde{O}(\\sqrt{d/T})$, where $T$ is the total sample size. We validate our theoretical studies with numerical experiments, confirming the efficacy of the proposed algorithm.", "bibtex": "@InProceedings{pmlr-v162-chen22t,\n title = \t {Learning Mixtures of Linear Dynamical Systems},\n author = {Chen, Yanxi and Poor, H. Vincent},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3507--3557},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22t/chen22t.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22t.html},\n abstract = \t {We study the problem of learning a mixture of multiple linear dynamical systems (LDSs) from unlabeled short sample trajectories, each generated by one of the LDS models. Despite the wide applicability of mixture models for time-series data, learning algorithms that come with end-to-end performance guarantees are largely absent from existing literature. There are multiple sources of technical challenges, including but not limited to (1) the presence of latent variables (i.e. the unknown labels of trajectories); (2) the possibility that the sample trajectories might have lengths much smaller than the dimension $d$ of the LDS models; and (3) the complicated temporal dependence inherent to time-series data. To tackle these challenges, we develop a two-stage meta-algorithm, which is guaranteed to efficiently recover each ground-truth LDS model up to error $\\tilde{O}(\\sqrt{d/T})$, where $T$ is the total sample size. 
We validate our theoretical studies with numerical experiments, confirming the efficacy of the proposed algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22t/chen22t.pdf", "supp": "", "pdf_size": 1131538, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8155872212414990084&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, Princeton University; Department of Electrical and Computer Engineering, Princeton University", "aff_domain": "princeton.edu; ", "email": "princeton.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/chen22t.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning Multiscale Transformer Models for Sequence Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17447", "id": "17447", "proceeding": "https://proceedings.mlr.press/v162/li22ac.html", "poster": "/media/PosterPDFs/ICML%202022/567b8f5f423af15818a068235807edc0.png?t=1656207914.0299027", "slides": "", "author_site": "Bei Li, Tong Zheng, yi jing, Chengbo Jiao, Tong Xiao, Jingbo Zhu", "author": "Bei Li; Tong Zheng; Yi Jing; Chengbo Jiao; Tong Xiao; Jingbo Zhu", "abstract": "Multiscale feature hierarchies have been witnessed the success in the computer vision area. This further motivates researchers to design multiscale Transformer for natural language processing, mostly based on the self-attention mechanism. For example, restricting the receptive field across heads or extracting local fine-grained features via convolutions. However, most of existing works directly modeled local features but ignored the word-boundary information. This results in redundant and ambiguous attention distributions, which lacks of interpretability. In this work, we define those scales in different linguistic units, including sub-words, words and phrases. We built a multiscale Transformer model by establishing relationships among scales based on word-boundary information and phrase-level prior knowledge. The proposed \\textbf{U}niversal \\textbf{M}ulti\\textbf{S}cale \\textbf{T}ransformer, namely \\textsc{Umst}, was evaluated on two sequence generation tasks. Notably, it yielded consistent performance gains over the strong baseline on several test sets without sacrificing the efficiency.", "bibtex": "@InProceedings{pmlr-v162-li22ac,\n title = \t {Learning Multiscale Transformer Models for Sequence Generation},\n author = {Li, Bei and Zheng, Tong and Jing, Yi and Jiao, Chengbo and Xiao, Tong and Zhu, Jingbo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13225--13241},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22ac/li22ac.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22ac.html},\n abstract = \t {Multiscale feature hierarchies have been witnessed the success in the computer vision area. 
This further motivates researchers to design multiscale Transformer for natural language processing, mostly based on the self-attention mechanism. For example, restricting the receptive field across heads or extracting local fine-grained features via convolutions. However, most of existing works directly modeled local features but ignored the word-boundary information. This results in redundant and ambiguous attention distributions, which lacks of interpretability. In this work, we define those scales in different linguistic units, including sub-words, words and phrases. We built a multiscale Transformer model by establishing relationships among scales based on word-boundary information and phrase-level prior knowledge. The proposed \\textbf{U}niversal \\textbf{M}ulti\\textbf{S}cale \\textbf{T}ransformer, namely \\textsc{Umst}, was evaluated on two sequence generation tasks. Notably, it yielded consistent performance gains over the strong baseline on several test sets without sacrificing the efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22ac/li22ac.pdf", "supp": "", "pdf_size": 6508019, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10490177289793431927&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science and Engineering, Northeastern University, Shenyang, China+ NiuTrans Research, Shenyang, China; School of Computer Science and Engineering, Northeastern University, Shenyang, China+ NiuTrans Research, Shenyang, China; School of Computer Science and Engineering, Northeastern University, Shenyang, China+ NiuTrans Research, Shenyang, China; NiuTrans Research, Shenyang, China; School of Computer Science and Engineering, Northeastern University, Shenyang, China+ NiuTrans Research, Shenyang, China; School of Computer Science and Engineering, Northeastern University, Shenyang, China+ NiuTrans Research, Shenyang, China", "aff_domain": "example.com;example.com;example.com;example.com;mail.neu.edu.cn;example.com", "email": "example.com;example.com;example.com;example.com;mail.neu.edu.cn;example.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/li22ac.html", "aff_unique_index": "0+1;0+1;0+1;1;0+1;0+1", "aff_unique_norm": "Northeastern University;NiuTrans Research", "aff_unique_dep": "School of Computer Science and Engineering;", "aff_unique_url": "http://www.neu.edu.cn/;", "aff_unique_abbr": "NEU;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Shenyang;", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0;0+0", "aff_country_unique": "China" }, { "title": "Learning Pseudometric-based Action Representations for Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17585", "id": "17585", "proceeding": "https://proceedings.mlr.press/v162/gu22b.html", "poster": "", "slides": "", "author_site": "Pengjie Gu, Mengchen Zhao, Chen Chen, Dong Li, Jianye Hao, Bo An", "author": "Pengjie Gu; Mengchen Zhao; Chen Chen; Dong Li; Jianye Hao; Bo An", "abstract": "Offline reinforcement learning is a promising approach for practical applications since it does not require interactions with real-world environments. However, existing offline RL methods only work well in environments with continuous or small discrete action spaces. 
In environments with large and discrete action spaces, such as recommender systems and dialogue systems, the performance of existing methods decreases drastically because they suffer from inaccurate value estimation for a large proportion of out-of-distribution (o.o.d.) actions. While recent works have demonstrated that online RL benefits from incorporating semantic information in action representations, unfortunately, they fail to learn reasonable relative distances between action representations, which is key to offline RL to reduce the influence of o.o.d. actions. This paper proposes an action representation learning framework for offline RL based on a pseudometric, which measures both the behavioral relation and the data-distributional relation between actions. We provide theoretical analysis on the continuity of the expected Q-values and the offline policy improvement using the learned action representations. Experimental results show that our methods significantly improve the performance of two typical offline RL methods in environments with large and discrete action spaces.", "bibtex": "@InProceedings{pmlr-v162-gu22b,\n title = \t {Learning Pseudometric-based Action Representations for Offline Reinforcement Learning},\n author = {Gu, Pengjie and Zhao, Mengchen and Chen, Chen and Li, Dong and Hao, Jianye and An, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7902--7918},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gu22b/gu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/gu22b.html},\n abstract = \t {Offline reinforcement learning is a promising approach for practical applications since it does not require interactions with real-world environments. However, existing offline RL methods only work well in environments with continuous or small discrete action spaces. In environments with large and discrete action spaces, such as recommender systems and dialogue systems, the performance of existing methods decreases drastically because they suffer from inaccurate value estimation for a large proportion of out-of-distribution (o.o.d.) actions. While recent works have demonstrated that online RL benefits from incorporating semantic information in action representations, unfortunately, they fail to learn reasonable relative distances between action representations, which is key to offline RL to reduce the influence of o.o.d. actions. This paper proposes an action representation learning framework for offline RL based on a pseudometric, which measures both the behavioral relation and the data-distributional relation between actions. We provide theoretical analysis on the continuity of the expected Q-values and the offline policy improvement using the learned action representations. 
Experimental results show that our methods significantly improve the performance of two typical offline RL methods in environments with large and discrete action spaces.}\n}", "pdf": "https://proceedings.mlr.press/v162/gu22b/gu22b.pdf", "supp": "", "pdf_size": 25989007, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7326857603943144470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Computer Science and Engineering, Nanyang Technological University, Singapore; Noah\u2019s Ark Lab, Huawei; Noah\u2019s Ark Lab, Huawei; Noah\u2019s Ark Lab, Huawei; College of Intelligence and Computing, Tianjin University; School of Computer Science and Engineering, Nanyang Technological University, Singapore", "aff_domain": "ntu.edu.sg;huawei.com;huawei.com;huawei.com;tju.edu.cn;ntu.edu.sg", "email": "ntu.edu.sg;huawei.com;huawei.com;huawei.com;tju.edu.cn;ntu.edu.sg", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/gu22b.html", "aff_unique_index": "0;1;1;1;2;0", "aff_unique_norm": "Nanyang Technological University;Huawei;Tianjin University", "aff_unique_dep": "School of Computer Science and Engineering;Noah\u2019s Ark Lab;College of Intelligence and Computing", "aff_unique_url": "https://www.ntu.edu.sg;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "NTU;Huawei;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Singapore;", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Learning Stable Classifiers by Transferring Unstable Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15997", "id": "15997", "proceeding": "https://proceedings.mlr.press/v162/bao22a.html", "poster": "/media/PosterPDFs/ICML%202022/dd458505749b2941217ddd59394240e8_adXGQbL.png?t=1657164566.8572884", "slides": "", "author_site": "Yujia Bao, Shiyu Chang, Regina Barzilay", "author": "Yujia Bao; Shiyu Chang; Dr.Regina Barzilay", "abstract": "While unbiased machine learning models are essential for many applications, bias is a human-defined concept that can vary across tasks. Given only input-label pairs, algorithms may lack sufficient information to distinguish stable (causal) features from unstable (spurious) features. However, related tasks often share similar biases \u2013 an observation we may leverage to develop stable classifiers in the transfer setting. In this work, we explicitly inform the target classifier about unstable features in the source tasks. Specifically, we derive a representation that encodes the unstable features by contrasting different data environments in the source task. We achieve robustness by clustering data of the target task according to this representation and minimizing the worst-case risk across these clusters. We evaluate our method on both text and image classifications. Empirical results demonstrate that our algorithm is able to maintain robustness on the target task for both synthetically generated environments and real-world environments. 
Our code is available at https://github.com/YujiaBao/Tofu.", "bibtex": "@InProceedings{pmlr-v162-bao22a,\n title = \t {Learning Stable Classifiers by Transferring Unstable Features},\n author = {Bao, Yujia and Chang, Shiyu and Barzilay, Dr.Regina},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1483--1507},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bao22a/bao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bao22a.html},\n abstract = \t {While unbiased machine learning models are essential for many applications, bias is a human-defined concept that can vary across tasks. Given only input-label pairs, algorithms may lack sufficient information to distinguish stable (causal) features from unstable (spurious) features. However, related tasks often share similar biases \u2013 an observation we may leverage to develop stable classifiers in the transfer setting. In this work, we explicitly inform the target classifier about unstable features in the source tasks. Specifically, we derive a representation that encodes the unstable features by contrasting different data environments in the source task. We achieve robustness by clustering data of the target task according to this representation and minimizing the worst-case risk across these clusters. We evaluate our method on both text and image classifications. Empirical results demonstrate that our algorithm is able to maintain robustness on the target task for both synthetically generated environments and real-world environments. Our code is available at https://github.com/YujiaBao/Tofu.}\n}", "pdf": "https://proceedings.mlr.press/v162/bao22a/bao22a.pdf", "supp": "", "pdf_size": 5246899, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13001665395610981653&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "MIT CSAIL; Computer Science, UC Santa Barbara; MIT CSAIL", "aff_domain": "csail.mit.edu; ; ", "email": "csail.mit.edu; ; ", "github": "https://github.com/YujiaBao/Tofu", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bao22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of California, Santa Barbara", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Department of Computer Science", "aff_unique_url": "https://www.csail.mit.edu;https://www.ucsb.edu", "aff_unique_abbr": "MIT CSAIL;UCSB", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Cambridge;Santa Barbara", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Stochastic Shortest Path with Linear Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16867", "id": "16867", "proceeding": "https://proceedings.mlr.press/v162/min22a.html", "poster": "", "slides": "", "author_site": "Yifei Min, Jiafan He, Tianhao Wang, Quanquan Gu", "author": "Yifei Min; Jiafan He; Tianhao Wang; Quanquan Gu", "abstract": "We study the stochastic shortest path (SSP) problem in reinforcement learning with linear function approximation, where the transition kernel is represented as a linear mixture of unknown models. 
We call this class of SSP problems as linear mixture SSPs. We propose a novel algorithm with Hoeffding-type confidence sets for learning the linear mixture SSP, which can attain an $\\tilde{\\mathcal{O}}(d B_{\\star}^{1.5}\\sqrt{K/c_{\\min}})$ regret. Here $K$ is the number of episodes, $d$ is the dimension of the feature mapping in the mixture model, $B_{\\star}$ bounds the expected cumulative cost of the optimal policy, and $c_{\\min}>0$ is the lower bound of the cost function. Our algorithm also applies to the case when $c_{\\min} = 0$, and an $\\tilde{\\mathcal{O}}(K^{2/3})$ regret is guaranteed. To the best of our knowledge, this is the first algorithm with a sublinear regret guarantee for learning linear mixture SSP. Moreover, we design a refined Bernstein-type confidence set and propose an improved algorithm, which provably achieves an $\\tilde{\\mathcal{O}}(d B_{\\star}\\sqrt{K/c_{\\min}})$ regret. In complement to the regret upper bounds, we also prove a lower bound of $\\Omega(dB_{\\star} \\sqrt{K})$. Hence, our improved algorithm matches the lower bound up to a $1/\\sqrt{c_{\\min}}$ factor and poly-logarithmic factors, achieving a near-optimal regret guarantee.", "bibtex": "@InProceedings{pmlr-v162-min22a,\n title = \t {Learning Stochastic Shortest Path with Linear Function Approximation},\n author = {Min, Yifei and He, Jiafan and Wang, Tianhao and Gu, Quanquan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15584--15629},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/min22a/min22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/min22a.html},\n abstract = \t {We study the stochastic shortest path (SSP) problem in reinforcement learning with linear function approximation, where the transition kernel is represented as a linear mixture of unknown models. We call this class of SSP problems as linear mixture SSPs. We propose a novel algorithm with Hoeffding-type confidence sets for learning the linear mixture SSP, which can attain an $\\tilde{\\mathcal{O}}(d B_{\\star}^{1.5}\\sqrt{K/c_{\\min}})$ regret. Here $K$ is the number of episodes, $d$ is the dimension of the feature mapping in the mixture model, $B_{\\star}$ bounds the expected cumulative cost of the optimal policy, and $c_{\\min}>0$ is the lower bound of the cost function. Our algorithm also applies to the case when $c_{\\min} = 0$, and an $\\tilde{\\mathcal{O}}(K^{2/3})$ regret is guaranteed. To the best of our knowledge, this is the first algorithm with a sublinear regret guarantee for learning linear mixture SSP. Moreover, we design a refined Bernstein-type confidence set and propose an improved algorithm, which provably achieves an $\\tilde{\\mathcal{O}}(d B_{\\star}\\sqrt{K/c_{\\min}})$ regret. In complement to the regret upper bounds, we also prove a lower bound of $\\Omega(dB_{\\star} \\sqrt{K})$. 
Hence, our improved algorithm matches the lower bound up to a $1/\\sqrt{c_{\\min}}$ factor and poly-logarithmic factors, achieving a near-optimal regret guarantee.}\n}", "pdf": "https://proceedings.mlr.press/v162/min22a/min22a.pdf", "supp": "", "pdf_size": 4208908, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12589791277500777414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Statistics and Data Science, Yale University, CT 06520, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA; Department of Statistics and Data Science, Yale University, CT 06520, USA; Department of Computer Science, University of California, Los Angeles, CA 90095, USA", "aff_domain": "yale.edu;ucla.edu;yale.edu;cs.ucla.edu", "email": "yale.edu;ucla.edu;yale.edu;cs.ucla.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/min22a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Yale University;University of California, Los Angeles", "aff_unique_dep": "Department of Statistics and Data Science;Department of Computer Science", "aff_unique_url": "https://www.yale.edu;https://www.ucla.edu", "aff_unique_abbr": "Yale;UCLA", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "New Haven;Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Symmetric Embeddings for Equivariant World Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17395", "id": "17395", "proceeding": "https://proceedings.mlr.press/v162/park22a.html", "poster": "", "slides": "", "author_site": "Jung Yeon Park, Ondrej Biza, Linfeng Zhao, Jan-Willem van de Meent, Robin Walters", "author": "Jung Yeon Park; Ondrej Biza; Linfeng Zhao; Jan-Willem Van De Meent; Robin Walters", "abstract": "Incorporating symmetries can lead to highly data-efficient and generalizable models by defining equivalence classes of data samples related by transformations. However, characterizing how transformations act on input data is often difficult, limiting the applicability of equivariant models. We propose learning symmetric embedding networks (SENs) that encode an input space (e.g. images), where we do not know the effect of transformations (e.g. rotations), to a feature space that transforms in a known manner under these operations. This network can be trained end-to-end with an equivariant task network to learn an explicitly symmetric representation. We validate this approach in the context of equivariant transition models with 3 distinct forms of symmetry. Our experiments demonstrate that SENs facilitate the application of equivariant networks to data with complex symmetry representations. 
Moreover, doing so can yield improvements in accuracy and generalization relative to both fully-equivariant and non-equivariant baselines.", "bibtex": "@InProceedings{pmlr-v162-park22a,\n title = \t {Learning Symmetric Embeddings for Equivariant World Models},\n author = {Park, Jung Yeon and Biza, Ondrej and Zhao, Linfeng and Van De Meent, Jan-Willem and Walters, Robin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17372--17389},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/park22a/park22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/park22a.html},\n abstract = \t {Incorporating symmetries can lead to highly data-efficient and generalizable models by defining equivalence classes of data samples related by transformations. However, characterizing how transformations act on input data is often difficult, limiting the applicability of equivariant models. We propose learning symmetric embedding networks (SENs) that encode an input space (e.g. images), where we do not know the effect of transformations (e.g. rotations), to a feature space that transforms in a known manner under these operations. This network can be trained end-to-end with an equivariant task network to learn an explicitly symmetric representation. We validate this approach in the context of equivariant transition models with 3 distinct forms of symmetry. Our experiments demonstrate that SENs facilitate the application of equivariant networks to data with complex symmetry representations. 
Moreover, doing so can yield improvements in accuracy and generalization relative to both fully-equivariant and non-equivariant baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/park22a/park22a.pdf", "supp": "", "pdf_size": 7544475, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17517971134760315540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Khoury College of Computer Sciences, Northeastern University, Boston, MA, USA+Informatics Institute, University of Amsterdam, Amsterdam, Netherlands; Khoury College of Computer Sciences, Northeastern University, Boston, MA, USA+Informatics Institute, University of Amsterdam, Amsterdam, Netherlands; Khoury College of Computer Sciences, Northeastern University, Boston, MA, USA; Informatics Institute, University of Amsterdam, Amsterdam, Netherlands; Khoury College of Computer Sciences, Northeastern University, Boston, MA, USA", "aff_domain": "northeastern.edu;northeastern.edu; ;uva.nl;northeastern.edu", "email": "northeastern.edu;northeastern.edu; ;uva.nl;northeastern.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/park22a.html", "aff_unique_index": "0+1;0+1;0;1;0", "aff_unique_norm": "Northeastern University;University of Amsterdam", "aff_unique_dep": "Khoury College of Computer Sciences;Informatics Institute", "aff_unique_url": "https://www.northeastern.edu;https://www.uva.nl", "aff_unique_abbr": "NU;UvA", "aff_campus_unique_index": "0+1;0+1;0;1;0", "aff_campus_unique": "Boston;Amsterdam", "aff_country_unique_index": "0+1;0+1;0;1;0", "aff_country_unique": "United States;Netherlands" }, { "title": "Learning fair representation with a parametric integral probability metric", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17003", "id": "17003", "proceeding": "https://proceedings.mlr.press/v162/kim22b.html", "poster": "/media/PosterPDFs/ICML%202022/9873eaad153c6c960616c89e54fe155a_0p9ZFqs.png?t=1657603230.7826037", "slides": "", "author_site": "Dongha Kim, Kunwoong Kim, Insung Kong, Ilsang Ohn, Yongdai Kim", "author": "Dongha Kim; Kunwoong Kim; Insung Kong; Ilsang Ohn; Yongdai Kim", "abstract": "As they have a vital effect on social decision-making, AI algorithms should be not only accurate but also fair. Among various algorithms for fairness AI, learning fair representation (LFR), whose goal is to find a fair representation with respect to sensitive variables such as gender and race, has received much attention. For LFR, the adversarial training scheme is popularly employed as is done in the generative adversarial network type algorithms. The choice of a discriminator, however, is done heuristically without justification. In this paper, we propose a new adversarial training scheme for LFR, where the integral probability metric (IPM) with a specific parametric family of discriminators is used. The most notable result of the proposed LFR algorithm is its theoretical guarantee about the fairness of the final prediction model, which has not been considered yet. That is, we derive theoretical relations between the fairness of representation and the fairness of the prediction model built on the top of the representation (i.e., using the representation as the input). 
Moreover, by numerical experiments, we show that our proposed LFR algorithm is computationally lighter and more stable, and the final prediction model is competitive or superior to other LFR algorithms using more complex discriminators.", "bibtex": "@InProceedings{pmlr-v162-kim22b,\n title = \t {Learning fair representation with a parametric integral probability metric},\n author = {Kim, Dongha and Kim, Kunwoong and Kong, Insung and Ohn, Ilsang and Kim, Yongdai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11074--11101},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22b/kim22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22b.html},\n abstract = \t {As they have a vital effect on social decision-making, AI algorithms should be not only accurate but also fair. Among various algorithms for fairness AI, learning fair representation (LFR), whose goal is to find a fair representation with respect to sensitive variables such as gender and race, has received much attention. For LFR, the adversarial training scheme is popularly employed as is done in the generative adversarial network type algorithms. The choice of a discriminator, however, is done heuristically without justification. In this paper, we propose a new adversarial training scheme for LFR, where the integral probability metric (IPM) with a specific parametric family of discriminators is used. The most notable result of the proposed LFR algorithm is its theoretical guarantee about the fairness of the final prediction model, which has not been considered yet. That is, we derive theoretical relations between the fairness of representation and the fairness of the prediction model built on the top of the representation (i.e., using the representation as the input). 
Moreover, by numerical experiments, we show that our proposed LFR algorithm is computationally lighter and more stable, and the final prediction model is competitive or superior to other LFR algorithms using more complex discriminators.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22b/kim22b.pdf", "supp": "", "pdf_size": 2426698, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7724112263757302618&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, Sungshin Women\u2019s University+Data Science Center, Sungshin Women\u2019s University; Department of Statistics, Seoul National University; Department of Statistics, Seoul National University; Department of Statistics, Inha University; Department of Statistics, Seoul National University", "aff_domain": "example.com;example.com;example.com;example.com;gmail.com", "email": "example.com;example.com;example.com;example.com;gmail.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/kim22b.html", "aff_unique_index": "0+0;1;1;2;1", "aff_unique_norm": "Sungshin Women's University;Seoul National University;Inha University", "aff_unique_dep": "Department of Statistics;Department of Statistics;Department of Statistics", "aff_unique_url": "http://www.sungshin.ac.kr;https://www.snu.ac.kr;https://www.inha.edu/", "aff_unique_abbr": "Sungshin WU;SNU;Inha", "aff_campus_unique_index": ";1;1;1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Learning from Counterfactual Links for Link Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16773", "id": "16773", "proceeding": "https://proceedings.mlr.press/v162/zhao22e.html", "poster": "/media/PosterPDFs/ICML%202022/f976b57bb9dd27aa2e7e7df2825893a6.png?t=1657735540.5568109", "slides": "", "author_site": "Tong Zhao, Gang Liu, Daheng Wang, Wenhao Yu, Meng Jiang", "author": "Tong Zhao; Gang Liu; Daheng Wang; Wenhao Yu; Meng Jiang", "abstract": "Learning to predict missing links is important for many graph-based applications. Existing methods were designed to learn the association between observed graph structure and existence of link between a pair of nodes. However, the causal relationship between the two variables was largely ignored for learning to predict links on a graph. In this work, we visit this factor by asking a counterfactual question: \"would the link still exist if the graph structure became different from observation?\" Its answer, counterfactual links, will be able to augment the graph data for representation learning. To create these links, we employ causal models that consider the information (i.e., learned representations) of node pairs as context, global graph structural properties as treatment, and link existence as outcome. We propose a novel data augmentation-based link prediction method that creates counterfactual links and learns representations from both the observed and counterfactual links. 
Experiments on benchmark data show that our graph learning method achieves state-of-the-art performance on the task of link prediction.", "bibtex": "@InProceedings{pmlr-v162-zhao22e,\n title = \t {Learning from Counterfactual Links for Link Prediction},\n author = {Zhao, Tong and Liu, Gang and Wang, Daheng and Yu, Wenhao and Jiang, Meng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26911--26926},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22e/zhao22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22e.html},\n abstract = \t {Learning to predict missing links is important for many graph-based applications. Existing methods were designed to learn the association between observed graph structure and existence of link between a pair of nodes. However, the causal relationship between the two variables was largely ignored for learning to predict links on a graph. In this work, we visit this factor by asking a counterfactual question: \"would the link still exist if the graph structure became different from observation?\" Its answer, counterfactual links, will be able to augment the graph data for representation learning. To create these links, we employ causal models that consider the information (i.e., learned representations) of node pairs as context, global graph structural properties as treatment, and link existence as outcome. We propose a novel data augmentation-based link prediction method that creates counterfactual links and learns representations from both the observed and counterfactual links. 
Experiments on benchmark data show that our graph learning method achieves state-of-the-art performance on the task of link prediction.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22e/zhao22e.pdf", "supp": "", "pdf_size": 4067800, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12649708640262432051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, University of Notre Dame, IN, USA; Department of Computer Science and Engineering, University of Notre Dame, IN, USA; Department of Computer Science and Engineering, University of Notre Dame, IN, USA; Department of Computer Science and Engineering, University of Notre Dame, IN, USA; Department of Computer Science and Engineering, University of Notre Dame, IN, USA", "aff_domain": "nd.edu; ; ; ; ", "email": "nd.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhao22e.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Notre Dame", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.nd.edu", "aff_unique_abbr": "Notre Dame", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Notre Dame", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning from Demonstration: Provably Efficient Adversarial Policy Imitation with Linear Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17215", "id": "17215", "proceeding": "https://proceedings.mlr.press/v162/liu22u.html", "poster": "", "slides": "/media/icml-2022/Slides/17215.pdf", "author_site": "ZHIHAN LIU, Yufeng Zhang, Zuyue Fu, Zhuoran Yang, Zhaoran Wang", "author": "Zhihan Liu; Yufeng Zhang; Zuyue Fu; Zhuoran Yang; Zhaoran Wang", "abstract": "In generative adversarial imitation learning (GAIL), the agent aims to learn a policy from an expert demonstration so that its performance cannot be discriminated from the expert policy on a certain predefined reward set. In this paper, we study GAIL in both online and offline settings with linear function approximation, where both the transition and reward function are linear in the feature maps. Besides the expert demonstration, in the online setting the agent can interact with the environment, while in the offline setting the agent only accesses an additional dataset collected by a prior. For online GAIL, we propose an optimistic generative adversarial policy imitation algorithm (OGAPI) and prove that OGAPI achieves $\\widetilde{\\mathcal{O}}(\\sqrt{H^4d^3K}+\\sqrt{H^3d^2K^2/N_1})$ regret. Here $N_1$ represents the number of trajectories of the expert demonstration, $d$ is the feature dimension, and $K$ is the number of episodes. For offline GAIL, we propose a pessimistic generative adversarial policy imitation algorithm (PGAPI). We also obtain the optimality gap of PGAPI, achieving the minimax lower bound in the utilization of the additional dataset. Assuming sufficient coverage on the additional dataset, we show that PGAPI achieves $\\widetilde{\\mathcal{O}}(\\sqrt{H^4d^2/K}+\\sqrt{H^4d^3/N_2}+\\sqrt{H^3d^2/N_1})$ optimality gap. 
Here $N_2$ represents the number of trajectories of the additional dataset with sufficient coverage.", "bibtex": "@InProceedings{pmlr-v162-liu22u,\n title = \t {Learning from Demonstration: Provably Efficient Adversarial Policy Imitation with Linear Function Approximation},\n author = {Liu, Zhihan and Zhang, Yufeng and Fu, Zuyue and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14094--14138},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22u/liu22u.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22u.html},\n abstract = \t {In generative adversarial imitation learning (GAIL), the agent aims to learn a policy from an expert demonstration so that its performance cannot be discriminated from the expert policy on a certain predefined reward set. In this paper, we study GAIL in both online and offline settings with linear function approximation, where both the transition and reward function are linear in the feature maps. Besides the expert demonstration, in the online setting the agent can interact with the environment, while in the offline setting the agent only accesses an additional dataset collected by a prior. For online GAIL, we propose an optimistic generative adversarial policy imitation algorithm (OGAPI) and prove that OGAPI achieves $\\widetilde{\\mathcal{O}}(\\sqrt{H^4d^3K}+\\sqrt{H^3d^2K^2/N_1})$ regret. Here $N_1$ represents the number of trajectories of the expert demonstration, $d$ is the feature dimension, and $K$ is the number of episodes. For offline GAIL, we propose a pessimistic generative adversarial policy imitation algorithm (PGAPI). We also obtain the optimality gap of PGAPI, achieving the minimax lower bound in the utilization of the additional dataset. Assuming sufficient coverage on the additional dataset, we show that PGAPI achieves $\\widetilde{\\mathcal{O}}(\\sqrt{H^4d^2/K}+\\sqrt{H^4d^3/N_2}+\\sqrt{H^3d^2/N_1})$ optimality gap. 
Here $N_2$ represents the number of trajectories of the additional dataset with sufficient coverage.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22u/liu22u.pdf", "supp": "", "pdf_size": 583700, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1396474413839748589&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Northwestern University; Northwestern University; Northwestern University; Yale University; Northwestern University", "aff_domain": "u.northwestern.edu;u.northwestern.edu;u.northwestern.edu;yale.edu;gmail.com", "email": "u.northwestern.edu;u.northwestern.edu;u.northwestern.edu;yale.edu;gmail.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22u.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Northwestern University;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "NU;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning from a Learning User for Optimal Recommendations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17983", "id": "17983", "proceeding": "https://proceedings.mlr.press/v162/yao22a.html", "poster": "/media/PosterPDFs/ICML%202022/2654d1a3f16bf62d0dc4f91fa3ec9377.png?t=1657492596.0591342", "slides": "", "author_site": "Fan Yao, Chuanhao Li, Denis Nekipelov, Hongning Wang, Haifeng Xu", "author": "Fan Yao; Chuanhao Li; Denis Nekipelov; Hongning Wang; Haifeng Xu", "abstract": "In real-world recommendation problems, especially those with a formidably large item space, users have to gradually learn to estimate the utility of any fresh recommendations from their experience about previously consumed items. This in turn affects their interaction dynamics with the system and can invalidate previous algorithms built on the omniscient user assumption. In this paper, we formalize a model to capture such \u201dlearning users\u201d and design an efficient system-side learning solution, coined Noise-Robust Active Ellipsoid Search (RAES), to confront the challenges brought by the non-stationary feedback from such a learning user. Interestingly, we prove that the regret of RAES deteriorates gracefully as the convergence rate of user learning becomes worse, until reaching linear regret when the user\u2019s learning fails to converge. Experiments on synthetic datasets demonstrate the strength of RAES for such a contemporaneous system-user learning problem. 
Our study provides a novel perspective on modeling the feedback loop in recommendation problems.", "bibtex": "@InProceedings{pmlr-v162-yao22a,\n title = \t {Learning from a Learning User for Optimal Recommendations},\n author = {Yao, Fan and Li, Chuanhao and Nekipelov, Denis and Wang, Hongning and Xu, Haifeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25382--25406},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yao22a/yao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yao22a.html},\n abstract = \t {In real-world recommendation problems, especially those with a formidably large item space, users have to gradually learn to estimate the utility of any fresh recommendations from their experience about previously consumed items. This in turn affects their interaction dynamics with the system and can invalidate previous algorithms built on the omniscient user assumption. In this paper, we formalize a model to capture such \u201dlearning users\u201d and design an efficient system-side learning solution, coined Noise-Robust Active Ellipsoid Search (RAES), to confront the challenges brought by the non-stationary feedback from such a learning user. Interestingly, we prove that the regret of RAES deteriorates gracefully as the convergence rate of user learning becomes worse, until reaching linear regret when the user\u2019s learning fails to converge. Experiments on synthetic datasets demonstrate the strength of RAES for such a contemporaneous system-user learning problem. 
Our study provides a novel perspective on modeling the feedback loop in recommendation problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/yao22a/yao22a.pdf", "supp": "", "pdf_size": 663315, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17647547143717154683&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of Virginia, USA+Department of Economics, University of Virginia, USA; Department of Computer Science, University of Virginia, USA; Department of Economics, University of Virginia, USA; Department of Computer Science, University of Virginia, USA; Department of Computer Science, University of Virginia, USA", "aff_domain": "virginia.edu; ; ;virginia.edu;virginia.edu", "email": "virginia.edu; ; ;virginia.edu;virginia.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yao22a.html", "aff_unique_index": "0+0;0;0;0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning inverse folding from millions of predicted structures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16885", "id": "16885", "proceeding": "https://proceedings.mlr.press/v162/hsu22a.html", "poster": "/media/PosterPDFs/ICML%202022/1b0114c51cc532ed34e1954b5b9e4b58.png?t=1657508890.9486003", "slides": "", "author_site": "Chloe Hsu, Robert Verkuil, Jason Liu, Zeming Lin, Brian Hie, Tom Sercu, Adam Lerer, Alexander Rives", "author": "Chloe Hsu; Robert Verkuil; Jason Liu; Zeming Lin; Brian Hie; Tom Sercu; Adam Lerer; Alexander Rives", "abstract": "We consider the problem of predicting a protein sequence from its backbone atom coordinates. Machine learning approaches to this problem to date have been limited by the number of available experimentally determined protein structures. We augment training data by nearly three orders of magnitude by predicting structures for 12M protein sequences using AlphaFold2. Trained with this additional data, a sequence-to-sequence transformer with invariant geometric input processing layers achieves 51% native sequence recovery on structurally held-out backbones with 72% recovery for buried residues, an overall improvement of almost 10 percentage points over existing methods. 
The model generalizes to a variety of more complex tasks including design of protein complexes, partially masked structures, binding interfaces, and multiple states.", "bibtex": "@InProceedings{pmlr-v162-hsu22a,\n title = \t {Learning inverse folding from millions of predicted structures},\n author = {Hsu, Chloe and Verkuil, Robert and Liu, Jason and Lin, Zeming and Hie, Brian and Sercu, Tom and Lerer, Adam and Rives, Alexander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8946--8970},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hsu22a/hsu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hsu22a.html},\n abstract = \t {We consider the problem of predicting a protein sequence from its backbone atom coordinates. Machine learning approaches to this problem to date have been limited by the number of available experimentally determined protein structures. We augment training data by nearly three orders of magnitude by predicting structures for 12M protein sequences using AlphaFold2. Trained with this additional data, a sequence-to-sequence transformer with invariant geometric input processing layers achieves 51% native sequence recovery on structurally held-out backbones with 72% recovery for buried residues, an overall improvement of almost 10 percentage points over existing methods. The model generalizes to a variety of more complex tasks including design of protein complexes, partially masked structures, binding interfaces, and multiple states.}\n}", "pdf": "https://proceedings.mlr.press/v162/hsu22a/hsu22a.pdf", "supp": "", "pdf_size": 3824267, "gs_citation": 475, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2813322174354285991&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of California, Berkeley + Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research + New York University; Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research", "aff_domain": "berkeley.edu; ; ; ; ; ;fb.com;fb.com", "email": "berkeley.edu; ; ; ; ; ;fb.com;fb.com", "github": "https://github.com/facebookresearch/esm", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/hsu22a.html", "aff_unique_index": "0+1;1;1;1+2;1;1;1;1", "aff_unique_norm": "University of California, Berkeley;Meta;New York University", "aff_unique_dep": ";Facebook AI Research;", "aff_unique_url": "https://www.berkeley.edu;https://research.facebook.com;https://www.nyu.edu", "aff_unique_abbr": "UC Berkeley;FAIR;NYU", "aff_campus_unique_index": "0;", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0+0;0;0;0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning of Cluster-based Feature Importance for Electronic Health Record Time-series", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17711", "id": "17711", "proceeding": "https://proceedings.mlr.press/v162/aguiar22a.html", "poster": "", "slides": "", "author_site": "Henrique Aguiar, Mauro Santos, Peter Watkinson, Tingting Zhu", "author": "Henrique Aguiar; Mauro Santos; Peter Watkinson; Tingting Zhu", "abstract": "The recent availability of Electronic Health Records (EHR) has 
allowed for the development of algorithms predicting inpatient risk of deterioration and trajectory evolution. However, prediction of disease progression with EHR is challenging since these data are sparse, heterogeneous, multi-dimensional, and multi-modal time-series. As such, clustering is regularly used to identify similar groups within the patient cohort to improve prediction. Current models have shown some success in obtaining cluster representations of patient trajectories. However, they i) fail to obtain clinical interpretability for each cluster, and ii) struggle to learn meaningful cluster numbers in the context of imbalanced distribution of disease outcomes. We propose a supervised deep learning model to cluster EHR data based on the identification of clinically understandable phenotypes with regard to both outcome prediction and patient trajectory. We introduce novel loss functions to address the problems of class imbalance and cluster collapse, and furthermore propose a feature-time attention mechanism to identify cluster-based phenotype importance across time and feature dimensions. We tested our model in two datasets corresponding to distinct medical settings. Our model yielded added interpretability to cluster formation and outperformed benchmarks by at least 4% in relevant metrics.", "bibtex": "@InProceedings{pmlr-v162-aguiar22a,\n title = \t {Learning of Cluster-based Feature Importance for Electronic Health Record Time-series},\n author = {Aguiar, Henrique and Santos, Mauro and Watkinson, Peter and Zhu, Tingting},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {161--179},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/aguiar22a/aguiar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/aguiar22a.html},\n abstract = \t {The recent availability of Electronic Health Records (EHR) has allowed for the development of algorithms predicting inpatient risk of deterioration and trajectory evolution. However, prediction of disease progression with EHR is challenging since these data are sparse, heterogeneous, multi-dimensional, and multi-modal time-series. As such, clustering is regularly used to identify similar groups within the patient cohort to improve prediction. Current models have shown some success in obtaining cluster representations of patient trajectories. However, they i) fail to obtain clinical interpretability for each cluster, and ii) struggle to learn meaningful cluster numbers in the context of imbalanced distribution of disease outcomes. We propose a supervised deep learning model to cluster EHR data based on the identification of clinically understandable phenotypes with regard to both outcome prediction and patient trajectory. We introduce novel loss functions to address the problems of class imbalance and cluster collapse, and furthermore propose a feature-time attention mechanism to identify cluster-based phenotype importance across time and feature dimensions. We tested our model in two datasets corresponding to distinct medical settings. 
Our model yielded added interpretability to cluster formation and outperformed benchmarks by at least 4% in relevant metrics.}\n}", "pdf": "https://proceedings.mlr.press/v162/aguiar22a/aguiar22a.pdf", "supp": "", "pdf_size": 2769872, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12601017023404547475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Engineering Science, University of Oxford, United Kingdom+Department of Engineering Science, University of Oxford, United Kingdom; Department of Engineering Science, University of Oxford, United Kingdom; Nuffield Department of Clinical Neurosciences, University of Oxford, United Kingdom; Department of Engineering Science, University of Oxford, United Kingdom", "aff_domain": "eng.ox.ac.uk; ; ; ", "email": "eng.ox.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/aguiar22a.html", "aff_unique_index": "0+0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Engineering Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning to Cut by Looking Ahead: Cutting Plane Selection via Imitation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16909", "id": "16909", "proceeding": "https://proceedings.mlr.press/v162/paulus22a.html", "poster": "/media/PosterPDFs/ICML%202022/38a77aa456fc813af07bb428f2363c8d.png?t=1657181989.9913342", "slides": "", "author_site": "Max Paulus, Giulia Zarpellon, Andreas Krause, Laurent Charlin, Chris Maddison", "author": "Max B Paulus; Giulia Zarpellon; Andreas Krause; Laurent Charlin; Chris Maddison", "abstract": "Cutting planes are essential for solving mixed-integer linear problems (MILPs), because they facilitate bound improvements on the optimal solution value. For selecting cuts, modern solvers rely on manually designed heuristics that are tuned to gauge the potential effectiveness of cuts. We show that a greedy selection rule explicitly looking ahead to select cuts that yield the best bound improvement delivers strong decisions for cut selection \u2013 but is too expensive to be deployed in practice. In response, we propose a new neural architecture (NeuralCut) for imitation learning on the lookahead expert. Our model outperforms standard baselines for cut selection on several synthetic MILP benchmarks. 
Experiments on a realistic B&C solver further validate our approach, and exhibit the potential of learning methods in this setting.", "bibtex": "@InProceedings{pmlr-v162-paulus22a,\n title = \t {Learning to Cut by Looking Ahead: Cutting Plane Selection via Imitation Learning},\n author = {Paulus, Max B and Zarpellon, Giulia and Krause, Andreas and Charlin, Laurent and Maddison, Chris},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17584--17600},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/paulus22a/paulus22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/paulus22a.html},\n abstract = \t {Cutting planes are essential for solving mixed-integer linear problems (MILPs), because they facilitate bound improvements on the optimal solution value. For selecting cuts, modern solvers rely on manually designed heuristics that are tuned to gauge the potential effectiveness of cuts. We show that a greedy selection rule explicitly looking ahead to select cuts that yield the best bound improvement delivers strong decisions for cut selection \u2013 but is too expensive to be deployed in practice. In response, we propose a new neural architecture (NeuralCut) for imitation learning on the lookahead expert. Our model outperforms standard baselines for cut selection on several synthetic MILP benchmarks. Experiments on a realistic B&C solver further validate our approach, and exhibit the potential of learning methods in this setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/paulus22a/paulus22a.pdf", "supp": "", "pdf_size": 1337462, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17009920548839863294&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; Vector Institute, Toronto, Canada; Department of Computer Science, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; Department of Decision Sciences, HEC Montr\u00e9al, Montr\u00e9al, Canada+Mila, Montr\u00e9al, Canada; Department of Computer Science and Department of Statistical Sciences, University of Toronto, Toronto, Canada", "aff_domain": "inf.ethz.ch;cs.toronto.edu; ;hec.ca;cs.toronto.edu", "email": "inf.ethz.ch;cs.toronto.edu; ;hec.ca;cs.toronto.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/paulus22a.html", "aff_unique_index": "0;1;0;2+3;4", "aff_unique_norm": "ETH Zurich;Vector Institute;HEC Montr\u00e9al;Mila;University of Toronto", "aff_unique_dep": "Department of Computer Science;;Department of Decision Sciences;;Department of Computer Science", "aff_unique_url": "https://www.ethz.ch;https://vectorinstitute.ai;https://www.hec.ca;https://mila.quebec;https://www.utoronto.ca", "aff_unique_abbr": "ETHZ;Vector Institute;HEC;Mila;U of T", "aff_campus_unique_index": "0;1;0;2+2;1", "aff_campus_unique": "Z\u00fcrich;Toronto;Montr\u00e9al", "aff_country_unique_index": "0;1;0;1+1;1", "aff_country_unique": "Switzerland;Canada" }, { "title": "Learning to Estimate and Refine Fluid Motion with Physical Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17519", "id": "17519", "proceeding": 
"https://proceedings.mlr.press/v162/zhang22ad.html", "poster": "/media/PosterPDFs/ICML%202022/922073b18844540f8fe447c3e93a25b7.png?t=1658130454.709844", "slides": "", "author_site": "Mingrui Zhang, Jianhong Wang, James Tlhomole, Matthew Piggott", "author": "Mingrui Zhang; Jianhong Wang; James B Tlhomole; Matthew Piggott", "abstract": "Extracting information on fluid motion directly from images is challenging. Fluid flow represents a complex dynamic system governed by the Navier-Stokes equations. General optical flow methods are typically designed for rigid body motion, and thus struggle if applied to fluid motion estimation directly. Further, optical flow methods only focus on two consecutive frames without utilising historical temporal information, while the fluid motion (velocity field) can be considered a continuous trajectory constrained by time-dependent partial differential equations (PDEs). This discrepancy has the potential to induce physically inconsistent estimations. Here we propose an unsupervised learning based prediction-correction scheme for fluid flow estimation. An estimate is first given by a PDE-constrained optical flow predictor, which is then refined by a physical based corrector. The proposed approach outperforms optical flow methods and shows competitive results compared to existing supervised learning based methods on a benchmark dataset. Furthermore, the proposed approach can generalize to complex real-world fluid scenarios where ground truth information is effectively unknowable. Finally, experiments demonstrate that the physical corrector can refine flow estimates by mimicking the operator splitting method commonly utilised in fluid dynamical simulation.", "bibtex": "@InProceedings{pmlr-v162-zhang22ad,\n title = \t {Learning to Estimate and Refine Fluid Motion with Physical Dynamics},\n author = {Zhang, Mingrui and Wang, Jianhong and Tlhomole, James B and Piggott, Matthew},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26575--26590},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ad/zhang22ad.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ad.html},\n abstract = \t {Extracting information on fluid motion directly from images is challenging. Fluid flow represents a complex dynamic system governed by the Navier-Stokes equations. General optical flow methods are typically designed for rigid body motion, and thus struggle if applied to fluid motion estimation directly. Further, optical flow methods only focus on two consecutive frames without utilising historical temporal information, while the fluid motion (velocity field) can be considered a continuous trajectory constrained by time-dependent partial differential equations (PDEs). This discrepancy has the potential to induce physically inconsistent estimations. Here we propose an unsupervised learning based prediction-correction scheme for fluid flow estimation. An estimate is first given by a PDE-constrained optical flow predictor, which is then refined by a physical based corrector. The proposed approach outperforms optical flow methods and shows competitive results compared to existing supervised learning based methods on a benchmark dataset. 
Furthermore, the proposed approach can generalize to complex real-world fluid scenarios where ground truth information is effectively unknowable. Finally, experiments demonstrate that the physical corrector can refine flow estimates by mimicking the operator splitting method commonly utilised in fluid dynamical simulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ad/zhang22ad.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhang22ad-supp.zip", "pdf_size": 15112429, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7117659598027113757&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Earth Science and Engineering, Imperial College London, UK; Department of Electrical and Electronic Engineering, Imperial College London, UK; Department of Earth Science and Engineering, Imperial College London, UK; Department of Earth Science and Engineering, Imperial College London, UK", "aff_domain": "imperial.ac.uk; ; ; ", "email": "imperial.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22ad.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "Department of Earth Science and Engineering", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "Imperial", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning to Hash Robustly, Guaranteed", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15991", "id": "15991", "proceeding": "https://proceedings.mlr.press/v162/andoni22a.html", "poster": "/media/PosterPDFs/ICML%202022/3a15c7d0bbe60300a39f76f8a5ba6896_PXypuxV.png?t=1656707776.2659233", "slides": "", "author_site": "Alexandr Andoni, Daniel Beaglehole", "author": "Alexandr Andoni; Daniel Beaglehole", "abstract": "The indexing algorithms for the high-dimensional nearest neighbor search (NNS) with the best worst-case guarantees are based on the randomized Locality Sensitive Hashing (LSH), and its derivatives. In practice, many heuristic approaches exist to \"learn\" the best indexing method in order to speed-up NNS, crucially adapting to the structure of the given dataset. Oftentimes, these heuristics outperform the LSH-based algorithms on real datasets, but, almost always, come at the cost of losing the guarantees of either correctness or robust performance on adversarial queries, or apply to datasets with an assumed extra structure/model. In this paper, we design an NNS algorithm for the Hamming space that has worst-case guarantees essentially matching that of theoretical algorithms, while optimizing the hashing to the structure of the dataset (think instance-optimal algorithms) for performance on the minimum-performing query. We evaluate the algorithm\u2019s ability to optimize for a given dataset both theoretically and practically. On the theoretical side, we exhibit a natural setting (dataset model) where our algorithm is much better than the standard theoretical one. 
On the practical side, we run experiments that show that our algorithm has a 1.8x and 2.1x better recall on the worst-performing queries to the MNIST and ImageNet datasets.", "bibtex": "@InProceedings{pmlr-v162-andoni22a,\n title = \t {Learning to Hash Robustly, Guaranteed},\n author = {Andoni, Alexandr and Beaglehole, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {599--618},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/andoni22a/andoni22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/andoni22a.html},\n abstract = \t {The indexing algorithms for the high-dimensional nearest neighbor search (NNS) with the best worst-case guarantees are based on the randomized Locality Sensitive Hashing (LSH), and its derivatives. In practice, many heuristic approaches exist to \"learn\" the best indexing method in order to speed-up NNS, crucially adapting to the structure of the given dataset. Oftentimes, these heuristics outperform the LSH-based algorithms on real datasets, but, almost always, come at the cost of losing the guarantees of either correctness or robust performance on adversarial queries, or apply to datasets with an assumed extra structure/model. In this paper, we design an NNS algorithm for the Hamming space that has worst-case guarantees essentially matching that of theoretical algorithms, while optimizing the hashing to the structure of the dataset (think instance-optimal algorithms) for performance on the minimum-performing query. We evaluate the algorithm\u2019s ability to optimize for a given dataset both theoretically and practically. On the theoretical side, we exhibit a natural setting (dataset model) where our algorithm is much better than the standard theoretical one. 
On the practical side, we run experiments that show that our algorithm has a 1.8x and 2.1x better recall on the worst-performing queries to the MNIST and ImageNet datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/andoni22a/andoni22a.pdf", "supp": "", "pdf_size": 567095, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4654828841040680818&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Computer Science and Engineering, UCSD, La Jolla, California; Department of Computer Science, Columbia University, New York, New York", "aff_domain": "ucsd.edu;ucsd.edu", "email": "ucsd.edu;ucsd.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/andoni22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of California, San Diego;Columbia University", "aff_unique_dep": "Computer Science and Engineering;Department of Computer Science", "aff_unique_url": "https://www.ucsd.edu;https://www.columbia.edu", "aff_unique_abbr": "UCSD;Columbia", "aff_campus_unique_index": "0;1", "aff_campus_unique": "La Jolla;New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning to Incorporate Texture Saliency Adaptive Attention to Image Cartoonization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18021", "id": "18021", "proceeding": "https://proceedings.mlr.press/v162/gao22k.html", "poster": "/media/PosterPDFs/ICML%202022/d707329bece455a462b58ce00d1194c9.png?t=1657217930.1830206", "slides": "", "author_site": "Xiang Gao, Yuqi Zhang, Yingjie Tian", "author": "Xiang Gao; Yuqi Zhang; Yingjie Tian", "abstract": "Image cartoonization is recently dominated by generative adversarial networks (GANs) from the perspective of unsupervised image-to-image translation, in which an inherent challenge is to precisely capture and sufficiently transfer characteristic cartoon styles (e.g., clear edges, smooth color shading, vivid colors, etc.). Existing advanced models try to enhance cartoonization effect by learning to promote edges adversarially, introducing style transfer loss, or learning to align style from multiple representation space. This paper demonstrates that more distinct and vivid cartoonization effect could be easily achieved with only basic adversarial loss. Observing that cartoon style is more evident in cartoon-texture-salient local image regions, we build a region-level adversarial learning branch in parallel with the normal image-level one, which constrains adversarial learning on cartoon-texture-salient local patches for better perceiving and transferring cartoon texture features. To this end, a novel cartoon-texture-saliency-sampler (CTSS) module is proposed to adaptively sample cartoon-texture-salient patches from training data. We present that such texture saliency adaptive attention is of significant importance in facilitating and enhancing cartoon stylization, which is a key missing ingredient of related methods. 
The superiority of our model in promoting cartoonization effect, especially for high-resolution input images, is fully demonstrated with extensive experiments.", "bibtex": "@InProceedings{pmlr-v162-gao22k,\n title = \t {Learning to Incorporate Texture Saliency Adaptive Attention to Image Cartoonization},\n author = {Gao, Xiang and Zhang, Yuqi and Tian, Yingjie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7183--7207},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22k/gao22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22k.html},\n abstract = \t {Image cartoonization is recently dominated by generative adversarial networks (GANs) from the perspective of unsupervised image-to-image translation, in which an inherent challenge is to precisely capture and sufficiently transfer characteristic cartoon styles (e.g., clear edges, smooth color shading, vivid colors, etc.). Existing advanced models try to enhance cartoonization effect by learning to promote edges adversarially, introducing style transfer loss, or learning to align style from multiple representation space. This paper demonstrates that more distinct and vivid cartoonization effect could be easily achieved with only basic adversarial loss. Observing that cartoon style is more evident in cartoon-texture-salient local image regions, we build a region-level adversarial learning branch in parallel with the normal image-level one, which constrains adversarial learning on cartoon-texture-salient local patches for better perceiving and transferring cartoon texture features. To this end, a novel cartoon-texture-saliency-sampler (CTSS) module is proposed to adaptively sample cartoon-texture-salient patches from training data. We present that such texture saliency adaptive attention is of significant importance in facilitating and enhancing cartoon stylization, which is a key missing ingredient of related methods. 
The superiority of our model in promoting cartoonization effect, especially for high-resolution input images, is fully demonstrated with extensive experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22k/gao22k.pdf", "supp": "", "pdf_size": 32209753, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11484326183315995757&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of Chinese Academy of Sciences+Noah\u2019s Ark Lab, Huawei Technologies; University of Chinese Academy of Sciences; University of Chinese Academy of Sciences", "aff_domain": "ucas.ac.cn;ucas.ac.cn;ucas.ac.cn", "email": "ucas.ac.cn;ucas.ac.cn;ucas.ac.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gao22k.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "University of Chinese Academy of Sciences;Huawei", "aff_unique_dep": ";Noah\u2019s Ark Lab", "aff_unique_url": "http://www.ucas.ac.cn;https://www.huawei.com", "aff_unique_abbr": "UCAS;Huawei", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Infer Structures of Network Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16403", "id": "16403", "proceeding": "https://proceedings.mlr.press/v162/rossi22a.html", "poster": "", "slides": "", "author_site": "Emanuele Rossi, Federico Monti, Yan Leng, Michael Bronstein, Xiaowen Dong", "author": "Emanuele Rossi; Federico Monti; Yan Leng; Michael Bronstein; Xiaowen Dong", "abstract": "Strategic interactions between a group of individuals or organisations can be modelled as games played on networks, where a player\u2019s payoff depends not only on their actions but also on those of their neighbours. Inferring the network structure from observed game outcomes (equilibrium actions) is an important problem with numerous potential applications in economics and social sciences. Existing methods mostly require the knowledge of the utility function associated with the game, which is often unrealistic to obtain in real-world scenarios. We adopt a transformer-like architecture which correctly accounts for the symmetries of the problem and learns a mapping from the equilibrium actions to the network structure of the game without explicit knowledge of the utility function. 
We test our method on three different types of network games using both synthetic and real-world data, and demonstrate its effectiveness in network structure inference and superior performance over existing methods.", "bibtex": "@InProceedings{pmlr-v162-rossi22a,\n title = \t {Learning to Infer Structures of Network Games},\n author = {Rossi, Emanuele and Monti, Federico and Leng, Yan and Bronstein, Michael and Dong, Xiaowen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18809--18827},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rossi22a/rossi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rossi22a.html},\n abstract = \t {Strategic interactions between a group of individuals or organisations can be modelled as games played on networks, where a player\u2019s payoff depends not only on their actions but also on those of their neighbours. Inferring the network structure from observed game outcomes (equilibrium actions) is an important problem with numerous potential applications in economics and social sciences. Existing methods mostly require the knowledge of the utility function associated with the game, which is often unrealistic to obtain in real-world scenarios. We adopt a transformer-like architecture which correctly accounts for the symmetries of the problem and learns a mapping from the equilibrium actions to the network structure of the game without explicit knowledge of the utility function. We test our method on three different types of network games using both synthetic and real-world data, and demonstrate its effectiveness in network structure inference and superior performance over existing methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/rossi22a/rossi22a.pdf", "supp": "", "pdf_size": 857176, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16150914369122606457&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Twitter, London, UK+Imperial College London, London, UK; Twitter, London, UK+Imperial College London, London, UK; The University of Texas at Austin, Austin, TX, USA; Twitter, London, UK+Imperial College London, London, UK+University of Oxford, Oxford, UK; University of Oxford, Oxford, UK", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/rossi22a.html", "aff_unique_index": "0+1;0+1;2;0+1+3;3", "aff_unique_norm": "Twitter;Imperial College London;University of Texas at Austin;University of Oxford", "aff_unique_dep": ";;;", "aff_unique_url": "https://twitter.com;https://www.imperial.ac.uk;https://www.utexas.edu;https://www.ox.ac.uk", "aff_unique_abbr": "Twitter;ICL;UT Austin;Oxford", "aff_campus_unique_index": "0+0;0+0;1;0+0+2;2", "aff_campus_unique": "London;Austin;Oxford", "aff_country_unique_index": "0+0;0+0;1;0+0+0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning to Predict Graphs with Fused Gromov-Wasserstein Barycenters", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17999", "id": "17999", "proceeding": "https://proceedings.mlr.press/v162/brogat-motte22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/84438b7aae55a0638073ef798e50b4ef_TXuJp0u.png?t=1656596674.352062", "slides": "", "author_site": "Luc Brogat-Motte, R\u00e9mi Flamary, Celine Brouard, Juho Rousu, Florence d'Alch\u00e9-Buc", "author": "Luc Brogat-Motte; R\u00e9mi Flamary; Celine Brouard; Juho Rousu; Florence D\u2019Alch\u00e9-Buc", "abstract": "This paper introduces a novel and generic framework to solve the flagship task of supervised labeled graph prediction by leveraging Optimal Transport tools. We formulate the problem as regression with the Fused Gromov-Wasserstein (FGW) loss and propose a predictive model relying on a FGW barycenter whose weights depend on inputs. First we introduce a non-parametric estimator based on kernel ridge regression for which theoretical results such as consistency and excess risk bound are proved. Next we propose an interpretable parametric model where the barycenter weights are modeled with a neural network and the graphs on which the FGW barycenter is calculated are additionally learned. Numerical experiments show the strength of the method and its ability to interpolate in the labeled graph space on simulated data and on a difficult metabolic identification problem where it can reach very good performance with very little engineering.", "bibtex": "@InProceedings{pmlr-v162-brogat-motte22a,\n title = \t {Learning to Predict Graphs with Fused Gromov-{W}asserstein Barycenters},\n author = {Brogat-Motte, Luc and Flamary, R{\\'e}mi and Brouard, Celine and Rousu, Juho and D'Alch{\\'e}-Buc, Florence},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2321--2335},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/brogat-motte22a/brogat-motte22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/brogat-motte22a.html},\n abstract = \t {This paper introduces a novel and generic framework to solve the flagship task of supervised labeled graph prediction by leveraging Optimal Transport tools. We formulate the problem as regression with the Fused Gromov-Wasserstein (FGW) loss and propose a predictive model relying on a FGW barycenter whose weights depend on inputs. First we introduce a non-parametric estimator based on kernel ridge regression for which theoretical results such as consistency and excess risk bound are proved. Next we propose an interpretable parametric model where the barycenter weights are modeled with a neural network and the graphs on which the FGW barycenter is calculated are additionally learned. 
Numerical experiments show the strength of the method and its ability to interpolate in the labeled graph space on simulated data and on a difficult metabolic identification problem where it can reach very good performance with very little engineering.}\n}", "pdf": "https://proceedings.mlr.press/v162/brogat-motte22a/brogat-motte22a.pdf", "supp": "", "pdf_size": 1638626, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=449987462895486157&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "LTCI, T\u00e9l\u00e9com Paris, Institut Polytechnique de Paris, France; Ecole Polytechnique, Institut Polytechnique de Paris, CMAP, UMR 7641, Palaiseau, France; Universit\u00e9 de Toulouse, INRAE, UR MIAT, France; Department of Computer Science, Aalto University, Finland; LTCI, T\u00e9l\u00e9com Paris, Institut Polytechnique de Paris, France", "aff_domain": "telecom-paris.fr; ; ; ; ", "email": "telecom-paris.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/brogat-motte22a.html", "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "T\u00e9l\u00e9com Paris;Ecole Polytechnique;Universit\u00e9 de Toulouse;Aalto University", "aff_unique_dep": "LTCI;CMAP, UMR 7641;INRAE, UR MIAT;Department of Computer Science", "aff_unique_url": "https://www.telecom-paris.fr;https://www.ecp.fr;https://www.univ-toulouse.fr;https://www.aalto.fi", "aff_unique_abbr": "T\u00e9l\u00e9com Paris;Ecole Polytechnique;UT;Aalto", "aff_campus_unique_index": "1", "aff_campus_unique": ";Palaiseau", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "France;Finland" }, { "title": "Learning to Separate Voices by Spatial Regions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16945", "id": "16945", "proceeding": "https://proceedings.mlr.press/v162/xu22b.html", "poster": "/media/PosterPDFs/ICML%202022/517da335fd0ec2f4a25ea139d5494163.png?t=1657795840.1206164", "slides": "", "author_site": "Zhongweiyang Xu, Romit Roy Choudhury", "author": "Alan Xu; Romit Roy Choudhury", "abstract": "We consider the problem of audio voice separation for binaural applications, such as earphones and hearing aids. While today\u2019s neural networks perform remarkably well (separating 4+ sources with 2 microphones) they assume a known or fixed maximum number of sources, K. Moreover, today\u2019s models are trained in a supervised manner, using training data synthesized from generic sources, environments, and human head shapes. This paper intends to relax both these constraints at the expense of a slight alteration in the problem definition. We observe that, when a received mixture contains too many sources, it is still helpful to separate them by region, i.e., isolating signal mixtures from each conical sector around the user\u2019s head. This requires learning the fine-grained spatial properties of each region, including the signal distortions imposed by a person\u2019s head. We propose a two-stage self-supervised framework in which overheard voices from earphones are pre-processed to extract relatively clean personalized signals, which are then used to train a region-wise separation model. Results show promising performance, underscoring the importance of personalization over a generic supervised approach. (audio samples available at our project website: https://uiuc-earable-computing.github.io/binaural). 
We believe this result could help real-world applications in selective hearing, noise cancellation, and audio augmented reality.", "bibtex": "@InProceedings{pmlr-v162-xu22b,\n title = \t {Learning to Separate Voices by Spatial Regions},\n author = {Xu, Alan and Choudhury, Romit Roy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24539--24549},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22b/xu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22b.html},\n abstract = \t {We consider the problem of audio voice separation for binaural applications, such as earphones and hearing aids. While today\u2019s neural networks perform remarkably well (separating 4+ sources with 2 microphones) they assume a known or fixed maximum number of sources, K. Moreover, today\u2019s models are trained in a supervised manner, using training data synthesized from generic sources, environments, and human head shapes. This paper intends to relax both these constraints at the expense of a slight alteration in the problem definition. We observe that, when a received mixture contains too many sources, it is still helpful to separate them by region, i.e., isolating signal mixtures from each conical sector around the user\u2019s head. This requires learning the fine-grained spatial properties of each region, including the signal distortions imposed by a person\u2019s head. We propose a two-stage self-supervised framework in which overheard voices from earphones are pre-processed to extract relatively clean personalized signals, which are then used to train a region-wise separation model. Results show promising performance, underscoring the importance of personalization over a generic supervised approach. (audio samples available at our project website: https://uiuc-earable-computing.github.io/binaural). 
We believe this result could help real-world applications in selective hearing, noise cancellation, and audio augmented reality.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22b/xu22b.pdf", "supp": "", "pdf_size": 944407, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17940568416667742306&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, Illinois, US; Department of Electrical and Computer Engineering, University of Illinois Urbana-Champaign, Illinois, US", "aff_domain": "illinois.edu;illinois.edu", "email": "illinois.edu;illinois.edu", "github": "", "project": "https://uiuc-earable-computing.github.io/binaural/", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/xu22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning to Solve PDE-constrained Inverse Problems with Graph Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16565", "id": "16565", "proceeding": "https://proceedings.mlr.press/v162/zhao22d.html", "poster": "/media/PosterPDFs/ICML%202022/c7635bfd99248a2cdef8249ef7bfbef4.png?t=1657669507.7380304", "slides": "", "author_site": "QINGQING ZHAO, David B. Lindell, Gordon Wetzstein", "author": "Qingqing Zhao; David B Lindell; Gordon Wetzstein", "abstract": "Learned graph neural networks (GNNs) have recently been established as fast and accurate alternatives for principled solvers in simulating the dynamics of physical systems. In many application domains across science and engineering, however, we are not only interested in a forward simulation but also in solving inverse problems with constraints defined by a partial differential equation (PDE). Here we explore GNNs to solve such PDE-constrained inverse problems. Given a sparse set of measurements, we are interested in recovering the initial condition or parameters of the PDE. We demonstrate that GNNs combined with autodecoder-style priors are well-suited for these tasks, achieving more accurate estimates of initial conditions or physical parameters than other learned approaches when applied to the wave equation or Navier Stokes equations. 
We also demonstrate computational speedups of up to 90x using GNNs compared to principled solvers.", "bibtex": "@InProceedings{pmlr-v162-zhao22d,\n title = \t {Learning to Solve {PDE}-constrained Inverse Problems with Graph Networks},\n author = {Zhao, Qingqing and Lindell, David B and Wetzstein, Gordon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26895--26910},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22d/zhao22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22d.html},\n abstract = \t {Learned graph neural networks (GNNs) have recently been established as fast and accurate alternatives for principled solvers in simulating the dynamics of physical systems. In many application domains across science and engineering, however, we are not only interested in a forward simulation but also in solving inverse problems with constraints defined by a partial differential equation (PDE). Here we explore GNNs to solve such PDE-constrained inverse problems. Given a sparse set of measurements, we are interested in recovering the initial condition or parameters of the PDE. We demonstrate that GNNs combined with autodecoder-style priors are well-suited for these tasks, achieving more accurate estimates of initial conditions or physical parameters than other learned approaches when applied to the wave equation or Navier Stokes equations. We also demonstrate computational speedups of up to 90x using GNNs compared to principled solvers.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22d/zhao22d.pdf", "supp": "", "pdf_size": 29201684, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1204890943503575289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Stanford University; Stanford University; Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning-based Optimisation of Particle Accelerators Under Partial Observability Without Real-World Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16985", "id": "16985", "proceeding": "https://proceedings.mlr.press/v162/kaiser22a.html", "poster": "/media/PosterPDFs/ICML%202022/ef72d53990bc4805684c9b61fa64a102.png?t=1657639050.2093298", "slides": "", "author_site": "Jan Kaiser, Oliver Stein, Annika Eichler", "author": "Jan Kaiser; Oliver Stein; Annika Eichler", "abstract": "In recent work, it has been shown that reinforcement learning (RL) is capable of solving a variety of problems at sometimes super-human performance levels. But despite continued advances in the field, applying RL to complex real-world control and optimisation problems has proven difficult. 
In this contribution, we demonstrate how to successfully apply RL to the optimisation of a highly complex real-world machine {\u2013} specifically a linear particle accelerator {\u2013} in an only partially observable setting and without requiring training on the real machine. Our method outperforms conventional optimisation algorithms in both the achieved result and time taken as well as already achieving close to human-level performance. We expect that such automation of machine optimisation will push the limits of operability, increase machine availability and lead to a paradigm shift in how such machines are operated, ultimately facilitating advances in a variety of fields, such as science and medicine among many others.", "bibtex": "@InProceedings{pmlr-v162-kaiser22a,\n title = \t {Learning-based Optimisation of Particle Accelerators Under Partial Observability Without Real-World Training},\n author = {Kaiser, Jan and Stein, Oliver and Eichler, Annika},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10575--10585},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kaiser22a/kaiser22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kaiser22a.html},\n abstract = \t {In recent work, it has been shown that reinforcement learning (RL) is capable of solving a variety of problems at sometimes super-human performance levels. But despite continued advances in the field, applying RL to complex real-world control and optimisation problems has proven difficult. In this contribution, we demonstrate how to successfully apply RL to the optimisation of a highly complex real-world machine {\u2013} specifically a linear particle accelerator {\u2013} in an only partially observable setting and without requiring training on the real machine. Our method outperforms conventional optimisation algorithms in both the achieved result and time taken as well as already achieving close to human-level performance. 
We expect that such automation of machine optimisation will push the limits of operability, increase machine availability and lead to a paradigm shift in how such machines are operated, ultimately facilitating advances in a variety of fields, such as science and medicine among many others.}\n}", "pdf": "https://proceedings.mlr.press/v162/kaiser22a/kaiser22a.pdf", "supp": "", "pdf_size": 6137889, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17601146277289124838&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Deutsches Elektronen-Synchrotron DESY; Deutsches Elektronen-Synchrotron DESY; Deutsches Elektronen-Synchrotron DESY", "aff_domain": "desy.de; ; ", "email": "desy.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kaiser22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Deutsches Elektronen-Synchrotron", "aff_unique_dep": "", "aff_unique_url": "https://www.desy.de", "aff_unique_abbr": "DESY", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Least Squares Estimation using Sketched Data with Heteroskedastic Errors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16241", "id": "16241", "proceeding": "https://proceedings.mlr.press/v162/lee22i.html", "poster": "/media/PosterPDFs/ICML%202022/3fd60983292458bf7dee75f12d5e9e05.png?t=1658358301.9784892", "slides": "/media/icml-2022/Slides/16241.pdf", "author_site": "Sokbae Lee, Serena Ng", "author": "Sokbae Lee; Serena Ng", "abstract": "Researchers may perform regressions using a sketch of data of size m instead of the full sample of size n for a variety of reasons. This paper considers the case when the regression errors do not have constant variance and heteroskedasticity robust standard errors would normally be needed for test statistics to provide accurate inference. We show that estimates using data sketched by random projections will behave \u2019as if\u2019 the errors were homoskedastic. Estimation by random sampling would not have this property. The result arises because the sketched estimates in the case of random projections can be expressed as degenerate U-statistics, and under certain conditions, these statistics are asymptotically normal with homoskedastic variance. We verify that the conditions hold not only in the case of least squares regression when the covariates are exogenous, but also in instrumental variables estimation when the covariates are endogenous. 
The result implies that inference can be simpler than the full sample case if the sketching scheme is appropriately chosen.", "bibtex": "@InProceedings{pmlr-v162-lee22i,\n title = \t {Least Squares Estimation using Sketched Data with Heteroskedastic Errors},\n author = {Lee, Sokbae and Ng, Serena},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12498--12520},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22i/lee22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22i.html},\n abstract = \t {Researchers may perform regressions using a sketch of data of size m instead of the full sample of size n for a variety of reasons. This paper considers the case when the regression errors do not have constant variance and heteroskedasticity robust standard errors would normally be needed for test statistics to provide accurate inference. We show that estimates using data sketched by random projections will behave \u2019as if\u2019 the errors were homoskedastic. Estimation by random sampling would not have this property. The result arises because the sketched estimates in the case of random projections can be expressed as degenerate U-statistics, and under certain conditions, these statistics are asymptotically normal with homoskedastic variance. We verify that the conditions hold not only in the case of least squares regression when the covariates are exogenous, but also in instrumental variables estimation when the covariates are endogenous. 
The result implies that inference can be simpler than the full sample case if the sketching scheme is appropriately chosen.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22i/lee22i.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lee22i-supp.zip", "pdf_size": 371734, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2973545111138164523&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Economics, Columbia University, New York, USA; Department of Economics, Columbia University, New York, USA", "aff_domain": "columbia.edu;columbia.edu", "email": "columbia.edu;columbia.edu", "github": "", "project": "https://cran.r-project.org/", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/lee22i.html", "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Economics", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Let Invariant Rationale Discovery Inspire Graph Contrastive Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17737", "id": "17737", "proceeding": "https://proceedings.mlr.press/v162/li22v.html", "poster": "/media/PosterPDFs/ICML%202022/067ee197a2aa979778923af77b40dd89.png?t=1657247692.97663", "slides": "", "author_site": "Sihang Li, Xiang Wang, An Zhang, Ying-Xin Wu, Xiangnan He, Tat-Seng Chua", "author": "Sihang Li; Xiang Wang; An Zhang; Yingxin Wu; Xiangnan He; Tat-Seng Chua", "abstract": "Leading graph contrastive learning (GCL) methods perform graph augmentations in two fashions: (1) randomly corrupting the anchor graph, which could cause the loss of semantic information, or (2) using domain knowledge to maintain salient features, which undermines the generalization to other domains. Taking an invariance look at GCL, we argue that a high-performing augmentation should preserve the salient semantics of anchor graphs regarding instance-discrimination. To this end, we relate GCL with invariant rationale discovery, and propose a new framework, Rationale-aware Graph Contrastive Learning (RGCL). Specifically, without supervision signals, RGCL uses a rationale generator to reveal salient features about graph instance-discrimination as the rationale, and then creates rationale-aware views for contrastive learning. This rationale-aware pre-training scheme endows the backbone model with the powerful representation ability, further facilitating the fine-tuning on downstream tasks. On MNIST-Superpixel and MUTAG datasets, visual inspections on the discovered rationales showcase that the rationale generator successfully captures the salient features (\\ie distinguishing semantic nodes in graphs). On biochemical molecule and social network benchmark datasets, the state-of-the-art performance of RGCL demonstrates the effectiveness of rationale-aware views for contrastive learning. 
Our codes are available at https://github.com/lsh0520/RGCL.", "bibtex": "@InProceedings{pmlr-v162-li22v,\n title = \t {Let Invariant Rationale Discovery Inspire Graph Contrastive Learning},\n author = {Li, Sihang and Wang, Xiang and Zhang, An and Wu, Yingxin and He, Xiangnan and Chua, Tat-Seng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13052--13065},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22v/li22v.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22v.html},\n abstract = \t {Leading graph contrastive learning (GCL) methods perform graph augmentations in two fashions: (1) randomly corrupting the anchor graph, which could cause the loss of semantic information, or (2) using domain knowledge to maintain salient features, which undermines the generalization to other domains. Taking an invariance look at GCL, we argue that a high-performing augmentation should preserve the salient semantics of anchor graphs regarding instance-discrimination. To this end, we relate GCL with invariant rationale discovery, and propose a new framework, Rationale-aware Graph Contrastive Learning (RGCL). Specifically, without supervision signals, RGCL uses a rationale generator to reveal salient features about graph instance-discrimination as the rationale, and then creates rationale-aware views for contrastive learning. This rationale-aware pre-training scheme endows the backbone model with the powerful representation ability, further facilitating the fine-tuning on downstream tasks. On MNIST-Superpixel and MUTAG datasets, visual inspections on the discovered rationales showcase that the rationale generator successfully captures the salient features (\\ie distinguishing semantic nodes in graphs). On biochemical molecule and social network benchmark datasets, the state-of-the-art performance of RGCL demonstrates the effectiveness of rationale-aware views for contrastive learning. 
Our codes are available at https://github.com/lsh0520/RGCL.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22v/li22v.pdf", "supp": "", "pdf_size": 1276268, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13286040992676917455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Information Science and Technology, University of Science and Technology of China, Hefei, China; School of Cyber Science and Technology, University of Science and Technology of China, Hefei, China; Sea-NExT Joint Lab, National University of Singapore, Singapore; School of Data Science, University of Science and Technology of China, Hefei, China; School of Data Science, University of Science and Technology of China, Hefei, China; Sea-NExT Joint Lab, National University of Singapore, Singapore", "aff_domain": "ustc.edu.cn;gmail.com;u.nus.edu;ustc.edu.cn;gmail.com;u.nus.edu", "email": "ustc.edu.cn;gmail.com;u.nus.edu;ustc.edu.cn;gmail.com;u.nus.edu", "github": "https://github.com/lsh0520/RGCL", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/li22v.html", "aff_unique_index": "0;0;1;0;0;1", "aff_unique_norm": "University of Science and Technology of China;National University of Singapore", "aff_unique_dep": "School of Information Science and Technology;Sea-NExT Joint Lab", "aff_unique_url": "http://www.ustc.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "USTC;NUS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hefei;", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Leverage Score Sampling for Tensor Product Matrices in Input Sparsity Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16935", "id": "16935", "proceeding": "https://proceedings.mlr.press/v162/woodruff22a.html", "poster": "/media/PosterPDFs/ICML%202022/d4df7b6239c425d8cc897411ef11abe7_4sB718K.png?t=1657712329.9347289", "slides": "", "author_site": "David Woodruff, Amir Zandieh", "author": "David Woodruff; Amir Zandieh", "abstract": "We propose an input sparsity time sampling algorithm that can spectrally approximate the Gram matrix corresponding to the q-fold column-wise tensor product of q matrices using a nearly optimal number of samples, improving upon all previously known methods by poly(q) factors. Furthermore, for the important special case of the q-fold self-tensoring of a dataset, which is the feature matrix of the degree-q polynomial kernel, the leading term of our method\u2019s runtime is proportional to the size of the dataset and has no dependence on q. Previous techniques either incur a poly(q) factor slowdown in their runtime or remove the dependence on q at the expense of having sub-optimal target dimension, and depend quadratically on the number of data-points in their runtime. Our sampling technique relies on a collection of q partially correlated random projections which can be simultaneously applied to a dataset X in total time that only depends on the size of X, and at the same time their q-fold Kronecker product acts as a near-isometry for any fixed vector in the column span of $X^{\\otimes q}$. 
We also show that our sampling methods generalize to other classes of kernels beyond polynomial, such as Gaussian and Neural Tangent kernels.", "bibtex": "@InProceedings{pmlr-v162-woodruff22a,\n title = \t {Leverage Score Sampling for Tensor Product Matrices in Input Sparsity Time},\n author = {Woodruff, David and Zandieh, Amir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23933--23964},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/woodruff22a/woodruff22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/woodruff22a.html},\n abstract = \t {We propose an input sparsity time sampling algorithm that can spectrally approximate the Gram matrix corresponding to the q-fold column-wise tensor product of q matrices using a nearly optimal number of samples, improving upon all previously known methods by poly(q) factors. Furthermore, for the important special case of the q-fold self-tensoring of a dataset, which is the feature matrix of the degree-q polynomial kernel, the leading term of our method\u2019s runtime is proportional to the size of the dataset and has no dependence on q. Previous techniques either incur a poly(q) factor slowdown in their runtime or remove the dependence on q at the expense of having sub-optimal target dimension, and depend quadratically on the number of data-points in their runtime. Our sampling technique relies on a collection of q partially correlated random projections which can be simultaneously applied to a dataset X in total time that only depends on the size of X, and at the same time their q-fold Kronecker product acts as a near-isometry for any fixed vector in the column span of $X^{\\otimes q}$. 
We also show that our sampling methods generalize to other classes of kernels beyond polynomial, such as Gaussian and Neural Tangent kernels.}\n}", "pdf": "https://proceedings.mlr.press/v162/woodruff22a/woodruff22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/woodruff22a-supp.zip", "pdf_size": 888055, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17475218619014582696&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Max-Planck-Institut f\u00fcr Informatik; Carnegie Mellon University", "aff_domain": "cs.cmu.edu;mpi-inf.mpg.de", "email": "cs.cmu.edu;mpi-inf.mpg.de", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/woodruff22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Max-Planck-Institut f\u00fcr Informatik;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://mpi-sws.org;https://www.cmu.edu", "aff_unique_abbr": "MPII;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "title": "Leveraging Approximate Symbolic Models for Reinforcement Learning via Skill Diversity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17381", "id": "17381", "proceeding": "https://proceedings.mlr.press/v162/guan22c.html", "poster": "/media/PosterPDFs/ICML%202022/fd9042c9b077fe393b18ae78feb58279_SFKCTbF.png?t=1658160900.6127713", "slides": "/media/icml-2022/Slides/17381.pdf", "author_site": "Lin Guan, Sarath Sreedharan, Subbarao Kambhampati", "author": "Lin Guan; Sarath Sreedharan; Subbarao Kambhampati", "abstract": "Creating reinforcement learning (RL) agents that are capable of accepting and leveraging task-specific knowledge from humans has been long identified as a possible strategy for developing scalable approaches for solving long-horizon problems. While previous works have looked at the possibility of using symbolic models along with RL approaches, they tend to assume that the high-level action models are executable at low level and the fluents can exclusively characterize all desirable MDP states. Symbolic models of real world tasks are however often incomplete. To this end, we introduce Approximate Symbolic-Model Guided Reinforcement Learning, wherein we will formalize the relationship between the symbolic model and the underlying MDP that will allow us to characterize the incompleteness of the symbolic model. We will use these models to extract high-level landmarks that will be used to decompose the task. At the low level, we learn a set of diverse policies for each possible task subgoal identified by the landmark, which are then stitched together. 
We evaluate our system by testing on three different benchmark domains and show how even with incomplete symbolic model information, our approach is able to discover the task structure and efficiently guide the RL agent towards the goal.", "bibtex": "@InProceedings{pmlr-v162-guan22c,\n title = \t {Leveraging Approximate Symbolic Models for Reinforcement Learning via Skill Diversity},\n author = {Guan, Lin and Sreedharan, Sarath and Kambhampati, Subbarao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7949--7967},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guan22c/guan22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/guan22c.html},\n abstract = \t {Creating reinforcement learning (RL) agents that are capable of accepting and leveraging task-specific knowledge from humans has been long identified as a possible strategy for developing scalable approaches for solving long-horizon problems. While previous works have looked at the possibility of using symbolic models along with RL approaches, they tend to assume that the high-level action models are executable at low level and the fluents can exclusively characterize all desirable MDP states. Symbolic models of real world tasks are however often incomplete. To this end, we introduce Approximate Symbolic-Model Guided Reinforcement Learning, wherein we will formalize the relationship between the symbolic model and the underlying MDP that will allow us to characterize the incompleteness of the symbolic model. We will use these models to extract high-level landmarks that will be used to decompose the task. At the low level, we learn a set of diverse policies for each possible task subgoal identified by the landmark, which are then stitched together. 
We evaluate our system by testing on three different benchmark domains and show how even with incomplete symbolic model information, our approach is able to discover the task structure and efficiently guide the RL agent towards the goal.}\n}", "pdf": "https://proceedings.mlr.press/v162/guan22c/guan22c.pdf", "supp": "", "pdf_size": 3336265, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9607066569965060600&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Computing & AI, Arizona State University; School of Computing & AI, Arizona State University; School of Computing & AI, Arizona State University", "aff_domain": "asu.edu; ; ", "email": "asu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/guan22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Arizona State University", "aff_unique_dep": "School of Computing & AI", "aff_unique_url": "https://asu.edu", "aff_unique_abbr": "ASU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tempe", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Lie Point Symmetry Data Augmentation for Neural PDE Solvers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17313", "id": "17313", "proceeding": "https://proceedings.mlr.press/v162/brandstetter22a.html", "poster": "/media/PosterPDFs/ICML%202022/764a9f2462bf088af07b6ae6c107e62c_VOEP8nM.png?t=1657746593.9794528", "slides": "", "author_site": "Johannes Brandstetter, Max Welling, Daniel Worrall", "author": "Johannes Brandstetter; Max Welling; Daniel E Worrall", "abstract": "Neural networks are increasingly being used to solve partial differential equations (PDEs), replacing slower numerical solvers. However, a critical issue is that neural PDE solvers require high-quality ground truth data, which usually must come from the very solvers they are designed to replace. Thus, we are presented with a proverbial chicken-and-egg problem. In this paper, we present a method, which can partially alleviate this problem, by improving neural PDE solver sample complexity\u2014Lie point symmetry data augmentation (LPSDA). In the context of PDEs, it turns out we are able to quantitatively derive an exhaustive list of data transformations, based on the Lie point symmetry group of the PDEs in question, something not possible in other application areas. We present this framework and demonstrate how it can easily be deployed to improve neural PDE solver sample complexity by an order of magnitude.", "bibtex": "@InProceedings{pmlr-v162-brandstetter22a,\n title = \t {Lie Point Symmetry Data Augmentation for Neural {PDE} Solvers},\n author = {Brandstetter, Johannes and Welling, Max and Worrall, Daniel E},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2241--2256},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/brandstetter22a/brandstetter22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/brandstetter22a.html},\n abstract = \t {Neural networks are increasingly being used to solve partial differential equations (PDEs), replacing slower numerical solvers. 
However, a critical issue is that neural PDE solvers require high-quality ground truth data, which usually must come from the very solvers they are designed to replace. Thus, we are presented with a proverbial chicken-and-egg problem. In this paper, we present a method, which can partially alleviate this problem, by improving neural PDE solver sample complexity\u2014Lie point symmetry data augmentation (LPSDA). In the context of PDEs, it turns out we are able to quantitatively derive an exhaustive list of data transformations, based on the Lie point symmetry group of the PDEs in question, something not possible in other application areas. We present this framework and demonstrate how it can easily be deployed to improve neural PDE solver sample complexity by an order of magnitude.}\n}", "pdf": "https://proceedings.mlr.press/v162/brandstetter22a/brandstetter22a.pdf", "supp": "", "pdf_size": 3631144, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6135726084743263275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "University of Amsterdam + Microsoft Research; University of Amsterdam; Qualcomm AI Research + Deepmind", "aff_domain": "ml.jku.at; ; ", "email": "ml.jku.at; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/brandstetter22a.html", "aff_unique_index": "0+1;0;2+3", "aff_unique_norm": "University of Amsterdam;Microsoft;Qualcomm;DeepMind", "aff_unique_dep": ";Microsoft Research;Qualcomm AI Research;", "aff_unique_url": "https://www.uva.nl;https://www.microsoft.com/en-us/research;https://www.qualcomm.com/research;https://deepmind.com", "aff_unique_abbr": "UvA;MSR;QAI;DeepMind", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;1+2", "aff_country_unique": "Netherlands;United States;United Kingdom" }, { "title": "Lightweight Projective Derivative Codes for Compressed Asynchronous Gradient Descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17285", "id": "17285", "proceeding": "https://proceedings.mlr.press/v162/soto22a.html", "poster": "/media/PosterPDFs/ICML%202022/442b548e816f05640dec68f497ca38ac.png?t=1657882119.358152", "slides": "/media/icml-2022/Slides/17285.pdf", "author_site": "Pedro Soto, Ilia Ilmer, Haibin Guan, Jun Li", "author": "Pedro J Soto; Ilia Ilmer; Haibin Guan; Jun Li", "abstract": "Coded distributed computation has become common practice for performing gradient descent on large datasets to mitigate stragglers and other faults. This paper proposes a novel algorithm that encodes the partial derivatives themselves and furthermore optimizes the codes by performing lossy compression on the derivative codewords by maximizing the information contained in the codewords while minimizing the information between the codewords. The utility of this application of coding theory is a geometrical consequence of the observed fact in optimization research that noise is tolerable, sometimes even helpful, in gradient descent based learning algorithms since it helps avoid overfitting and local minima. This stands in contrast with much current conventional work on distributed coded computation which focuses on recovering all of the data from the workers. A second further contribution is that the low-weight nature of the coding scheme allows for asynchronous gradient updates since the code can be iteratively decoded; i.e., a worker\u2019s task can immediately be updated into the larger gradient. 
The directional derivative is always a linear function of the direction vectors; thus, our framework is robust since it can apply linear coding techniques to general machine learning frameworks such as deep neural networks.", "bibtex": "@InProceedings{pmlr-v162-soto22a,\n title = \t {Lightweight Projective Derivative Codes for Compressed Asynchronous Gradient Descent},\n author = {Soto, Pedro J and Ilmer, Ilia and Guan, Haibin and Li, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20444--20458},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/soto22a/soto22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/soto22a.html},\n abstract = \t {Coded distributed computation has become common practice for performing gradient descent on large datasets to mitigate stragglers and other faults. This paper proposes a novel algorithm that encodes the partial derivatives themselves and furthermore optimizes the codes by performing lossy compression on the derivative codewords by maximizing the information contained in the codewords while minimizing the information between the codewords. The utility of this application of coding theory is a geometrical consequence of the observed fact in optimization research that noise is tolerable, sometimes even helpful, in gradient descent based learning algorithms since it helps avoid overfitting and local minima. This stands in contrast with much current conventional work on distributed coded computation which focuses on recovering all of the data from the workers. A second further contribution is that the low-weight nature of the coding scheme allows for asynchronous gradient updates since the code can be iteratively decoded; i.e., a worker\u2019s task can immediately be updated into the larger gradient. 
The directional derivative is always a linear function of the direction vectors; thus, our framework is robust since it can apply linear coding techniques to general machine learning frameworks such as deep neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/soto22a/soto22a.pdf", "supp": "", "pdf_size": 1452953, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4415272542655942425&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, The Graduate Center, CUNY, New York, USA; Department of Computer Science, The Graduate Center, CUNY, New York, USA; Icahn School of Medicine at Mount Sinai, New York, USA; Department of Computer Science, CUNY Queens College & Graduate Center, New York, USA", "aff_domain": "gradcenter.cuny.edu; ; ; ", "email": "gradcenter.cuny.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/soto22a.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Graduate Center, CUNY;Icahn School of Medicine at Mount Sinai;CUNY Queens College", "aff_unique_dep": "Department of Computer Science;;Department of Computer Science", "aff_unique_url": "https://www.gc.cuny.edu;https://icahn.mssm.edu;https://www.qc.cuny.edu", "aff_unique_abbr": "GC CUNY;ISMMS;QC", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "New York;Queens", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Linear Adversarial Concept Erasure", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17321", "id": "17321", "proceeding": "https://proceedings.mlr.press/v162/ravfogel22a.html", "poster": "/media/PosterPDFs/ICML%202022/cbef46321026d8404bc3216d4774c8a9.png?t=1657444639.1441085", "slides": "", "author_site": "Shaul Ravfogel, Michael Twiton, Yoav Goldberg, Ryan Cotterell", "author": "Shauli Ravfogel; Michael Twiton; Yoav Goldberg; Ryan D Cotterell", "abstract": "Modern neural models trained on textual data rely on pre-trained representations that emerge without direct supervision. As these representations are increasingly being used in real-world applications, the inability to", "bibtex": "@InProceedings{pmlr-v162-ravfogel22a,\n title = \t {Linear Adversarial Concept Erasure},\n author = {Ravfogel, Shauli and Twiton, Michael and Goldberg, Yoav and Cotterell, Ryan D},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18400--18421},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ravfogel22a/ravfogel22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ravfogel22a.html},\n abstract = \t {Modern neural models trained on textual data rely on pre-trained representations that emerge without direct supervision. 
As these representations are increasingly being used in real-world applications, the inability to", "pdf": "https://proceedings.mlr.press/v162/ravfogel22a/ravfogel22a.pdf", "supp": "", "pdf_size": 3472160, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=157683061025883774&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Bar Ilan University+Allen Institute for Artificial Intelligence; Independent researcher; Department of Computer Science, Bar Ilan University+Allen Institute for Artificial Intelligence; ETH Z\u00fcrich", "aff_domain": "gmail.com;gmail.com;gmail.com;inf.ethz.ch", "email": "gmail.com;gmail.com;gmail.com;inf.ethz.ch", "github": "https://github.com/shauli-ravfogel/rlace-icml", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ravfogel22a.html", "aff_unique_index": "0+1;2;0+1;3", "aff_unique_norm": "Bar-Ilan University;Allen Institute for Artificial Intelligence;Independent Researcher;ETH Zurich", "aff_unique_dep": "Department of Computer Science;;;", "aff_unique_url": "https://www.biu.ac.il;https://allenai.org;;https://www.ethz.ch", "aff_unique_abbr": "BIU;AI2;;ETHZ", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0+1;3", "aff_country_unique": "Israel;United States;;Switzerland" }, { "title": "Linear Bandit Algorithms with Sublinear Time Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16459", "id": "16459", "proceeding": "https://proceedings.mlr.press/v162/yang22m.html", "poster": "/media/PosterPDFs/ICML%202022/adf7ee2dcf142b0e11888e72b43fcb75.png?t=1657649780.072961", "slides": "", "author_site": "Shuo Yang, Tongzheng Ren, Sanjay Shakkottai, Eric Price, Inderjit Dhillon, Sujay Sanghavi", "author": "Shuo Yang; Tongzheng Ren; Sanjay Shakkottai; Eric Price; Inderjit S. Dhillon; Sujay Sanghavi", "abstract": "We propose two linear bandits algorithms with per-step complexity sublinear in the number of arms $K$. The algorithms are designed for applications where the arm set is extremely large and slowly changing. Our key realization is that choosing an arm reduces to a maximum inner product search (MIPS) problem, which can be solved approximately without breaking regret guarantees. Existing approximate MIPS solvers run in sublinear time. We extend those solvers and present theoretical guarantees for online learning problems, where adaptivity (i.e., a later step depends on the feedback in previous steps) becomes a unique challenge. We then explicitly characterize the tradeoff between the per-step complexity and regret. For sufficiently large $K$, our algorithms have sublinear per-step complexity and $\\widetilde O(\\sqrt{T})$ regret. Empirically, we evaluate our proposed algorithms in a synthetic environment and a real-world online movie recommendation problem. Our proposed algorithms can deliver a more than 72 times speedup compared to the linear time baselines while retaining similar regret.", "bibtex": "@InProceedings{pmlr-v162-yang22m,\n title = \t {Linear Bandit Algorithms with Sublinear Time Complexity},\n author = {Yang, Shuo and Ren, Tongzheng and Shakkottai, Sanjay and Price, Eric and Dhillon, Inderjit S. 
and Sanghavi, Sujay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25241--25260},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22m/yang22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22m.html},\n abstract = \t {We propose two linear bandits algorithms with per-step complexity sublinear in the number of arms $K$. The algorithms are designed for applications where the arm set is extremely large and slowly changing. Our key realization is that choosing an arm reduces to a maximum inner product search (MIPS) problem, which can be solved approximately without breaking regret guarantees. Existing approximate MIPS solvers run in sublinear time. We extend those solvers and present theoretical guarantees for online learning problems, where adaptivity (i.e., a later step depends on the feedback in previous steps) becomes a unique challenge. We then explicitly characterize the tradeoff between the per-step complexity and regret. For sufficiently large $K$, our algorithms have sublinear per-step complexity and $\\widetilde O(\\sqrt{T})$ regret. Empirically, we evaluate our proposed algorithms in a synthetic environment and a real-world online movie recommendation problem. Our proposed algorithms can deliver a more than 72 times speedup compared to the linear time baselines while retaining similar regret.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22m/yang22m.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yang22m-supp.zip", "pdf_size": 430822, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15692152906188228674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of CS, The University of Texas at Austin, TX, USA; Department of CS, The University of Texas at Austin, TX, USA; Department of ECE, The University of Texas at Austin, TX, USA; Department of CS, The University of Texas at Austin, TX, USA; Department of CS, The University of Texas at Austin, TX, USA; Department of ECE, The University of Texas at Austin, TX, USA", "aff_domain": "utexas.edu; ; ; ; ; ", "email": "utexas.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yang22m.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of CS", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Linear Complexity Randomized Self-attention Mechanism", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16977", "id": "16977", "proceeding": "https://proceedings.mlr.press/v162/zheng22b.html", "poster": "", "slides": "/media/icml-2022/Slides/16977.pdf", "author_site": "Lin Zheng, Chong Wang, Lingpeng Kong", "author": "Lin Zheng; Chong Wang; Lingpeng Kong", "abstract": "Recently, random feature attentions (RFAs) are proposed to approximate the softmax attention in linear time and space complexity by linearizing the exponential kernel. 
In this paper, we first propose a novel perspective to understand the bias in such approximation by recasting RFAs as self-normalized importance samplers. This perspective further sheds light on an", "bibtex": "@InProceedings{pmlr-v162-zheng22b,\n title = \t {Linear Complexity Randomized Self-attention Mechanism},\n author = {Zheng, Lin and Wang, Chong and Kong, Lingpeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27011--27041},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zheng22b/zheng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zheng22b.html},\n abstract = \t {Recently, random feature attentions (RFAs) are proposed to approximate the softmax attention in linear time and space complexity by linearizing the exponential kernel. In this paper, we first propose a novel perspective to understand the bias in such approximation by recasting RFAs as self-normalized importance samplers. This perspective further sheds light on an", "pdf": "https://proceedings.mlr.press/v162/zheng22b/zheng22b.pdf", "supp": "", "pdf_size": 607235, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14626202728773890667&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, The University of Hong Kong + ByteDance Inc.; ByteDance Inc.; Shanghai Artificial Intelligence Laboratory", "aff_domain": "connect.hku.hk; ; ", "email": "connect.hku.hk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zheng22b.html", "aff_unique_index": "0+1;1;2", "aff_unique_norm": "University of Hong Kong;ByteDance;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.hku.hk;https://www.bytedance.com;http://www.shailab.org/", "aff_unique_abbr": "HKU;ByteDance;Shanghai AI Lab", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "China" }, { "title": "Linear-Time Gromov Wasserstein Distances using Low Rank Couplings and Costs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18353", "id": "18353", "proceeding": "https://proceedings.mlr.press/v162/scetbon22b.html", "poster": "/media/PosterPDFs/ICML%202022/e1054bf2d703bca1e8fe101d3ac5efcd.png?t=1657716780.4742649", "slides": "", "author_site": "Meyer Scetbon, Gabriel Peyr\u00e9, Marco Cuturi", "author": "Meyer Scetbon; Gabriel Peyr\u00e9; Marco Cuturi", "abstract": "The ability to align points across two related yet incomparable point clouds (e.g. living in different spaces) plays an important role in machine learning. The Gromov-Wasserstein (GW) framework provides an increasingly popular answer to such problems, by seeking a low-distortion, geometry-preserving assignment between these points. As a non-convex, quadratic generalization of optimal transport (OT), GW is NP-hard. While practitioners often resort to solving GW approximately as a nested sequence of entropy-regularized OT problems, the cubic complexity (in the number $n$ of samples) of that approach is a roadblock. 
We show in this work how a recent variant of the OT problem that restricts the set of admissible couplings to those having a low-rank factorization is remarkably well suited to the resolution of GW: when applied to GW, we show that this approach is not only able to compute a stationary point of the GW problem in time $O(n^2)$, but also uniquely positioned to benefit from the knowledge that the initial cost matrices are low-rank, to yield a linear time $O(n)$ GW approximation. Our approach yields similar results, yet orders of magnitude faster computation than the SoTA entropic GW approaches, on both simulated and real data.", "bibtex": "@InProceedings{pmlr-v162-scetbon22b,\n title = \t {Linear-Time Gromov {W}asserstein Distances using Low Rank Couplings and Costs},\n author = {Scetbon, Meyer and Peyr{\\'e}, Gabriel and Cuturi, Marco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19347--19365},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/scetbon22b/scetbon22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/scetbon22b.html},\n abstract = \t {The ability to align points across two related yet incomparable point clouds (e.g. living in different spaces) plays an important role in machine learning. The Gromov-Wasserstein (GW) framework provides an increasingly popular answer to such problems, by seeking a low-distortion, geometry-preserving assignment between these points. As a non-convex, quadratic generalization of optimal transport (OT), GW is NP-hard. While practitioners often resort to solving GW approximately as a nested sequence of entropy-regularized OT problems, the cubic complexity (in the number $n$ of samples) of that approach is a roadblock. We show in this work how a recent variant of the OT problem that restricts the set of admissible couplings to those having a low-rank factorization is remarkably well suited to the resolution of GW: when applied to GW, we show that this approach is not only able to compute a stationary point of the GW problem in time $O(n^2)$, but also uniquely positioned to benefit from the knowledge that the initial cost matrices are low-rank, to yield a linear time $O(n)$ GW approximation. 
Our approach yields similar results, yet orders of magnitude faster computation than the SoTA entropic GW approaches, on both simulated and real data.}\n}", "pdf": "https://proceedings.mlr.press/v162/scetbon22b/scetbon22b.pdf", "supp": "", "pdf_size": 5824632, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=883418138428344777&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "CREST-ENSAE; CNRS and ENS, PSL; CREST-ENSAE + Google + Apple", "aff_domain": "ensae.fr; ; ", "email": "ensae.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/scetbon22b.html", "aff_unique_index": "0;1;0+2+3", "aff_unique_norm": "CREST - Ecole Nationale de la Statistique et de l'Administration Economique;CNRS;Google;Apple", "aff_unique_dep": ";;Google;Apple Inc.", "aff_unique_url": "https://www.crest.fr;https://www.cnrs.fr;https://www.google.com;https://www.apple.com", "aff_unique_abbr": "CREST-ENSAE;CNRS;Google;Apple", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0+1+1", "aff_country_unique": "France;United States" }, { "title": "Linearity Grafting: Relaxed Neuron Pruning Helps Certifiable Robustness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17553", "id": "17553", "proceeding": "https://proceedings.mlr.press/v162/chen22af.html", "poster": "/media/PosterPDFs/ICML%202022/db5f9f42a7157abe65bb145000b5871a.png?t=1657516552.3992095", "slides": "", "author_site": "Tianlong Chen, Huan Zhang, Zhenyu Zhang, Shiyu Chang, Sijia Liu, Pin-Yu Chen, Zhangyang \u201cAtlas\u201d Wang", "author": "Tianlong Chen; Huan Zhang; Zhenyu Zhang; Shiyu Chang; Sijia Liu; Pin-Yu Chen; Zhangyang Wang", "abstract": "Certifiable robustness is a highly desirable property for adopting deep neural networks (DNNs) in safety-critical scenarios, but often demands tedious computations to establish. The main hurdle lies in the massive amount of non-linearity in large DNNs. To trade off the DNN expressiveness (which calls for more non-linearity) and robustness certification scalability (which prefers more linearity), we propose a novel solution to strategically manipulate neurons, by \"grafting\" appropriate levels of linearity. The core of our proposal is to first linearize insignificant ReLU neurons, to eliminate the non-linear components that are both redundant for DNN performance and harmful to its certification. We then optimize the associated slopes and intercepts of the replaced linear activations for restoring model performance while maintaining certifiability. Hence, typical neuron pruning could be viewed as a special case of grafting a linear function of the fixed zero slopes and intercept, that might overly restrict the network flexibility and sacrifice its performance. Extensive experiments on multiple datasets and network backbones show that our linearity grafting can (1) effectively tighten certified bounds; (2) achieve competitive certifiable robustness without certified robust training (i.e., over 30% improvements on CIFAR-10 models); and (3) scale up complete verification to large adversarially trained models with 17M parameters. 
Codes are available at https://github.com/VITA-Group/Linearity-Grafting.", "bibtex": "@InProceedings{pmlr-v162-chen22af,\n title = \t {Linearity Grafting: Relaxed Neuron Pruning Helps Certifiable Robustness},\n author = {Chen, Tianlong and Zhang, Huan and Zhang, Zhenyu and Chang, Shiyu and Liu, Sijia and Chen, Pin-Yu and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3760--3772},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22af/chen22af.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22af.html},\n abstract = \t {Certifiable robustness is a highly desirable property for adopting deep neural networks (DNNs) in safety-critical scenarios, but often demands tedious computations to establish. The main hurdle lies in the massive amount of non-linearity in large DNNs. To trade off the DNN expressiveness (which calls for more non-linearity) and robustness certification scalability (which prefers more linearity), we propose a novel solution to strategically manipulate neurons, by \"grafting\" appropriate levels of linearity. The core of our proposal is to first linearize insignificant ReLU neurons, to eliminate the non-linear components that are both redundant for DNN performance and harmful to its certification. We then optimize the associated slopes and intercepts of the replaced linear activations for restoring model performance while maintaining certifiability. Hence, typical neuron pruning could be viewed as a special case of grafting a linear function of the fixed zero slopes and intercept, that might overly restrict the network flexibility and sacrifice its performance. Extensive experiments on multiple datasets and network backbones show that our linearity grafting can (1) effectively tighten certified bounds; (2) achieve competitive certifiable robustness without certified robust training (i.e., over 30% improvements on CIFAR-10 models); and (3) scale up complete verification to large adversarially trained models with 17M parameters. 
Codes are available at https://github.com/VITA-Group/Linearity-Grafting.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22af/chen22af.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22af-supp.zip", "pdf_size": 3103155, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2944620875879702886&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Texas at Austin; Carnegie Mellon University; University of Texas at Austin; University of California, Santa Barbara; Michigan State University+MIT-IBM Watson AI Lab; MIT-IBM Watson AI Lab+IBM Research; University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ; ; ;utexas.edu", "email": "utexas.edu; ; ; ; ; ;utexas.edu", "github": "https://github.com/VITA-Group/Linearity-Grafting", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/chen22af.html", "aff_unique_index": "0;1;0;2;3+4;4+5;0", "aff_unique_norm": "University of Texas at Austin;Carnegie Mellon University;University of California, Santa Barbara;Michigan State University;Massachusetts Institute of Technology;IBM", "aff_unique_dep": ";;;;IBM Watson AI Lab;IBM Research", "aff_unique_url": "https://www.utexas.edu;https://www.cmu.edu;https://www.ucsb.edu;https://www.msu.edu;https://www.mitibmwatsonailab.org;https://www.ibm.com/research", "aff_unique_abbr": "UT Austin;CMU;UCSB;MSU;MIT-IBM AI Lab;IBM", "aff_campus_unique_index": "0;0;2;;;0", "aff_campus_unique": "Austin;;Santa Barbara", "aff_country_unique_index": "0;0;0;0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Local Augmentation for Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16799", "id": "16799", "proceeding": "https://proceedings.mlr.press/v162/liu22s.html", "poster": "/media/PosterPDFs/ICML%202022/6974ce5ac660610b44d9b9fed0ff9548.png?t=1657490665.5703084", "slides": "", "author_site": "Songtao Liu, Rex (Zhitao) Ying, Hanze Dong, Lanqing Li, Tingyang Xu, Yu Rong, Peilin Zhao, Junzhou Huang, Dinghao Wu", "author": "Songtao Liu; Rex Ying; Hanze Dong; Lanqing Li; Tingyang Xu; Yu Rong; Peilin Zhao; Junzhou Huang; Dinghao Wu", "abstract": "Graph Neural Networks (GNNs) have achieved remarkable performance on graph-based tasks. The key idea for GNNs is to obtain informative representation through aggregating information from local neighborhoods. However, it remains an open question whether the neighborhood information is adequately aggregated for learning representations of nodes with few neighbors. To address this, we propose a simple and efficient data augmentation strategy, local augmentation, to learn the distribution of the node representations of the neighbors conditioned on the central node\u2019s representation and enhance GNN\u2019s expressive power with generated features. Local augmentation is a general framework that can be applied to any GNN model in a plug-and-play manner. It samples feature vectors associated with each node from the learned conditional distribution as additional input for the backbone model at each training iteration. Extensive experiments and analyses show that local augmentation consistently yields performance improvement when applied to various GNN architectures across a diverse set of benchmarks. For example, experiments show that plugging in local augmentation to GCN and GAT improves by an average of 3.4% and 1.6% in terms of test accuracy on Cora, Citeseer, and Pubmed. 
Besides, our experimental results on large graphs (OGB) show that our model consistently improves performance over backbones. Code is available at https://github.com/SongtaoLiu0823/LAGNN.", "bibtex": "@InProceedings{pmlr-v162-liu22s,\n title = \t {Local Augmentation for Graph Neural Networks},\n author = {Liu, Songtao and Ying, Rex and Dong, Hanze and Li, Lanqing and Xu, Tingyang and Rong, Yu and Zhao, Peilin and Huang, Junzhou and Wu, Dinghao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14054--14072},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22s/liu22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22s.html},\n abstract = \t {Graph Neural Networks (GNNs) have achieved remarkable performance on graph-based tasks. The key idea for GNNs is to obtain informative representation through aggregating information from local neighborhoods. However, it remains an open question whether the neighborhood information is adequately aggregated for learning representations of nodes with few neighbors. To address this, we propose a simple and efficient data augmentation strategy, local augmentation, to learn the distribution of the node representations of the neighbors conditioned on the central node\u2019s representation and enhance GNN\u2019s expressive power with generated features. Local augmentation is a general framework that can be applied to any GNN model in a plug-and-play manner. It samples feature vectors associated with each node from the learned conditional distribution as additional input for the backbone model at each training iteration. Extensive experiments and analyses show that local augmentation consistently yields performance improvement when applied to various GNN architectures across a diverse set of benchmarks. For example, experiments show that plugging in local augmentation to GCN and GAT improves by an average of 3.4% and 1.6% in terms of test accuracy on Cora, Citeseer, and Pubmed. Besides, our experimental results on large graphs (OGB) show that our model consistently improves performance over backbones. 
Code is available at https://github.com/SongtaoLiu0823/LAGNN.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22s/liu22s.pdf", "supp": "", "pdf_size": 560448, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1477899180662383839&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "The Pennsylvania State University; Stanford University; Hong Kong University of Science and Technology; Tencent AI Lab; Tencent AI Lab; Tencent AI Lab; Tencent AI Lab; Tencent AI Lab; The Pennsylvania State University", "aff_domain": "psu.edu;stanford.edu; ;gmail.com; ; ; ; ;psu.edu", "email": "psu.edu;stanford.edu; ;gmail.com; ; ; ; ;psu.edu", "github": "https://github.com/SongtaoLiu0823/LAGNN", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/liu22s.html", "aff_unique_index": "0;1;2;3;3;3;3;3;0", "aff_unique_norm": "Pennsylvania State University;Stanford University;Hong Kong University of Science and Technology;Tencent", "aff_unique_dep": ";;;Tencent AI Lab", "aff_unique_url": "https://www.psu.edu;https://www.stanford.edu;https://www.ust.hk;https://ai.tencent.com", "aff_unique_abbr": "PSU;Stanford;HKUST;Tencent AI Lab", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Hong Kong SAR", "aff_country_unique_index": "0;0;1;1;1;1;1;1;0", "aff_country_unique": "United States;China" }, { "title": "Local Linear Convergence of Douglas-Rachford for Linear Programming: a Probabilistic Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16421", "id": "16421", "proceeding": "https://proceedings.mlr.press/v162/faust22a.html", "poster": "", "slides": "", "author_site": "Oisin Faust, Hamza Fawzi", "author": "Oisin Faust; Hamza Fawzi", "abstract": "Douglas-Rachford splitting/ADMM (henceforth DRS) is a very popular algorithm for solving convex optimisation problems to low or moderate accuracy, and in particular for solving large-scale linear programs. Despite recent progress, obtaining highly accurate solutions to linear programs with DRS remains elusive. In this paper we analyze the local linear convergence rate $r$ of the DRS method for random linear programs, and give explicit and tight bounds on $r$. We show that $1-r^2$ is typically of the order of $m^{-1}(n-m)^{-1}$, where $n$ is the number of variables and $m$ is the number of constraints. This provides a quantitative explanation for the very slow convergence of DRS/ADMM on random LPs. The proof of our result relies on an established characterisation of the linear rate of convergence as the cosine of the Friedrichs angle between two subspaces associated to the problem. We also show that the cosecant of this angle can be interpreted as a condition number for the LP. The proof of our result relies on a characterization of the linear rate of convergence as the cosine of the Friedrichs angle between two subspaces associated to the problem. 
We also show that the cosecant of this angle can be interpreted as a condition number for the LP.", "bibtex": "@InProceedings{pmlr-v162-faust22a,\n title = \t {Local Linear Convergence of Douglas-Rachford for Linear Programming: a Probabilistic Analysis},\n author = {Faust, Oisin and Fawzi, Hamza},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6358--6372},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/faust22a/faust22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/faust22a.html},\n abstract = \t {Douglas-Rachford splitting/ADMM (henceforth DRS) is a very popular algorithm for solving convex optimisation problems to low or moderate accuracy, and in particular for solving large-scale linear programs. Despite recent progress, obtaining highly accurate solutions to linear programs with DRS remains elusive. In this paper we analyze the local linear convergence rate $r$ of the DRS method for random linear programs, and give explicit and tight bounds on $r$. We show that $1-r^2$ is typically of the order of $m^{-1}(n-m)^{-1}$, where $n$ is the number of variables and $m$ is the number of constraints. This provides a quantitative explanation for the very slow convergence of DRS/ADMM on random LPs. The proof of our result relies on an established characterisation of the linear rate of convergence as the cosine of the Friedrichs angle between two subspaces associated to the problem. We also show that the cosecant of this angle can be interpreted as a condition number for the LP.
}\n}", "pdf": "https://proceedings.mlr.press/v162/faust22a/faust22a.pdf", "supp": "", "pdf_size": 513972, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10945923704447114719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Department of Applied Mathematics and Theoretical Physics, University of Cambridge, Cambridge, United Kingdom + Cantab Capital Institute for the Mathematics of Information, University of Cambridge, Cambridge, United Kingdom; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, Cambridge, United Kingdom + Cantab Capital Institute for the Mathematics of Information, University of Cambridge, Cambridge, United Kingdom", "aff_domain": "maths.cam.ac.uk; ", "email": "maths.cam.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/faust22a.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "Department of Applied Mathematics and Theoretical Physics", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Locally Sparse Neural Networks for Tabular Biomedical Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16809", "id": "16809", "proceeding": "https://proceedings.mlr.press/v162/yang22i.html", "poster": "/media/PosterPDFs/ICML%202022/dffac38df13c3a801f1b8994f9303bcc_qB4aQZr.png?t=1657750380.0357404", "slides": "", "author_site": "Junchen Yang, Ofir Lindenbaum, Yuval Kluger", "author": "Junchen Yang; Ofir Lindenbaum; Yuval Kluger", "abstract": "Tabular datasets with low-sample-size or many variables are prevalent in biomedicine. Practitioners in this domain prefer linear or tree-based models over neural networks since the latter are harder to interpret and tend to overfit when applied to tabular datasets. To address these neural networks\u2019 shortcomings, we propose an intrinsically interpretable network for heterogeneous biomedical data. We design a locally sparse neural network where the local sparsity is learned to identify the subset of most relevant features for each sample. This sample-specific sparsity is predicted via a gating network, which is trained in tandem with the prediction network. By forcing the model to select a subset of the most informative features for each sample, we reduce model overfitting in low-sample-size data and obtain an interpretable model. We demonstrate that our method outperforms state-of-the-art models when applied to synthetic or real-world biomedical datasets using extensive experiments. Furthermore, the proposed framework dramatically outperforms existing schemes when evaluating its interpretability capabilities. 
Finally, we demonstrate the applicability of our model to two important biomedical tasks: survival analysis and marker gene identification.", "bibtex": "@InProceedings{pmlr-v162-yang22i,\n title = \t {Locally Sparse Neural Networks for Tabular Biomedical Data},\n author = {Yang, Junchen and Lindenbaum, Ofir and Kluger, Yuval},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25123--25153},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22i/yang22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22i.html},\n abstract = \t {Tabular datasets with low-sample-size or many variables are prevalent in biomedicine. Practitioners in this domain prefer linear or tree-based models over neural networks since the latter are harder to interpret and tend to overfit when applied to tabular datasets. To address these neural networks\u2019 shortcomings, we propose an intrinsically interpretable network for heterogeneous biomedical data. We design a locally sparse neural network where the local sparsity is learned to identify the subset of most relevant features for each sample. This sample-specific sparsity is predicted via a gating network, which is trained in tandem with the prediction network. By forcing the model to select a subset of the most informative features for each sample, we reduce model overfitting in low-sample-size data and obtain an interpretable model. We demonstrate that our method outperforms state-of-the-art models when applied to synthetic or real-world biomedical datasets using extensive experiments. Furthermore, the proposed framework dramatically outperforms existing schemes when evaluating its interpretability capabilities. 
Finally, we demonstrate the applicability of our model to two important biomedical tasks: survival analysis and marker gene identification.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22i/yang22i.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yang22i-supp.zip", "pdf_size": 6123527, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=521653928336269491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Interdepartmental Program in Computational Biology and Bioinformatics, Yale University, New Haven, CT, USA+Applied Math Program, Yale University, New Haven, CT, USA+Department of Pathology, School of Medicine, Yale University, New Haven, CT, USA; Faculty of Engineering, Bar Ilan University, Ramat Gan, Israel; Interdepartmental Program in Computational Biology and Bioinformatics, Yale University, New Haven, CT, USA+Applied Math Program, Yale University, New Haven, CT, USA+Department of Pathology, School of Medicine, Yale University, New Haven, CT, USA", "aff_domain": "yale.edu;biu.ac.il;yale.edu", "email": "yale.edu;biu.ac.il;yale.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yang22i.html", "aff_unique_index": "0+0+0;1;0+0+0", "aff_unique_norm": "Yale University;Bar-Ilan University", "aff_unique_dep": "Interdepartmental Program in Computational Biology and Bioinformatics;Faculty of Engineering", "aff_unique_url": "https://www.yale.edu;https://www.biu.ac.il", "aff_unique_abbr": "Yale;BIU", "aff_campus_unique_index": "0+0+0;1;0+0+0", "aff_campus_unique": "New Haven;Ramat Gan", "aff_country_unique_index": "0+0+0;1;0+0+0", "aff_country_unique": "United States;Israel" }, { "title": "Log-Euclidean Signatures for Intrinsic Distances Between Unaligned Datasets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16407", "id": "16407", "proceeding": "https://proceedings.mlr.press/v162/shnitzer22a.html", "poster": "/media/PosterPDFs/ICML%202022/59dfa2df42d9e3d41f5b02bfc32229dd.png?t=1657726770.473695", "slides": "/media/icml-2022/Slides/16407.pdf", "author_site": "Tal Shnitzer, Mikhail Yurochkin, Kristjan Greenewald, Justin Solomon", "author": "Tal Shnitzer; Mikhail Yurochkin; Kristjan Greenewald; Justin M Solomon", "abstract": "The need for efficiently comparing and representing datasets with unknown alignment spans various fields, from model analysis and comparison in machine learning to trend discovery in collections of medical datasets. We use manifold learning to compare the intrinsic geometric structures of different datasets by comparing their diffusion operators, symmetric positive-definite (SPD) matrices that relate to approximations of the continuous Laplace-Beltrami operator from discrete samples. Existing methods typically assume known data alignment and compare such operators in a pointwise manner. Instead, we exploit the Riemannian geometry of SPD matrices to compare these operators and define a new theoretically-motivated distance based on a lower bound of the log-Euclidean metric. Our framework facilitates comparison of data manifolds expressed in datasets with different sizes, numbers of features, and measurement modalities. 
Our log-Euclidean signature (LES) distance recovers meaningful structural differences, outperforming competing methods in various application domains.", "bibtex": "@InProceedings{pmlr-v162-shnitzer22a,\n title = \t {Log-{E}uclidean Signatures for Intrinsic Distances Between Unaligned Datasets},\n author = {Shnitzer, Tal and Yurochkin, Mikhail and Greenewald, Kristjan and Solomon, Justin M},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20106--20124},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shnitzer22a/shnitzer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shnitzer22a.html},\n abstract = \t {The need for efficiently comparing and representing datasets with unknown alignment spans various fields, from model analysis and comparison in machine learning to trend discovery in collections of medical datasets. We use manifold learning to compare the intrinsic geometric structures of different datasets by comparing their diffusion operators, symmetric positive-definite (SPD) matrices that relate to approximations of the continuous Laplace-Beltrami operator from discrete samples. Existing methods typically assume known data alignment and compare such operators in a pointwise manner. Instead, we exploit the Riemannian geometry of SPD matrices to compare these operators and define a new theoretically-motivated distance based on a lower bound of the log-Euclidean metric. Our framework facilitates comparison of data manifolds expressed in datasets with different sizes, numbers of features, and measurement modalities. 
Our log-Euclidean signature (LES) distance recovers meaningful structural differences, outperforming competing methods in various application domains.}\n}", "pdf": "https://proceedings.mlr.press/v162/shnitzer22a/shnitzer22a.pdf", "supp": "", "pdf_size": 5819209, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=528448898197574004&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "MIT CSAIL; IBM Research + MIT-IBM Watson AI Lab; IBM Research + MIT-IBM Watson AI Lab; MIT CSAIL", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/shnitzer22a.html", "aff_unique_index": "0;1+0;1+0;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;IBM Research", "aff_unique_url": "https://www.csail.mit.edu;https://www.ibm.com/research", "aff_unique_abbr": "MIT CSAIL;IBM", "aff_campus_unique_index": "0;;;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Loss Function Learning for Domain Generalization by Implicit Gradient", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16425", "id": "16425", "proceeding": "https://proceedings.mlr.press/v162/gao22b.html", "poster": "/media/PosterPDFs/ICML%202022/aace49c7d80767cffec0e513ae886df0.png?t=1657532905.4468107", "slides": "/media/icml-2022/Slides/16425.pdf", "author_site": "Boyan Gao, Henry Gouk, Yongxin Yang, Timothy Hospedales", "author": "Boyan Gao; Henry Gouk; Yongxin Yang; Timothy Hospedales", "abstract": "Generalising robustly to distribution shift is a major challenge that is pervasive across most real-world applications of machine learning. A recent study highlighted that many advanced algorithms proposed to tackle such domain generalisation (DG) fail to outperform a properly tuned empirical risk minimisation (ERM) baseline. We take a different approach, and explore the impact of the ERM loss function on out-of-domain generalisation. In particular, we introduce a novel meta-learning approach to loss function search based on implicit gradient. This enables us to discover a general purpose parametric loss function that provides a drop-in replacement for cross-entropy. Our loss can be used in standard training pipelines to efficiently train robust models using any neural architecture on new datasets. The results show that it clearly surpasses cross-entropy, enables simple ERM to outperform some more complicated prior DG methods, and provides state-of-the-art performance across a variety of DG benchmarks. 
Furthermore, unlike most existing DG approaches, our setup applies to the most practical setting of single-source domain generalisation, on which we show significant improvement.", "bibtex": "@InProceedings{pmlr-v162-gao22b,\n title = \t {Loss Function Learning for Domain Generalization by Implicit Gradient},\n author = {Gao, Boyan and Gouk, Henry and Yang, Yongxin and Hospedales, Timothy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7002--7016},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22b/gao22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22b.html},\n abstract = \t {Generalising robustly to distribution shift is a major challenge that is pervasive across most real-world applications of machine learning. A recent study highlighted that many advanced algorithms proposed to tackle such domain generalisation (DG) fail to outperform a properly tuned empirical risk minimisation (ERM) baseline. We take a different approach, and explore the impact of the ERM loss function on out-of-domain generalisation. In particular, we introduce a novel meta-learning approach to loss function search based on implicit gradient. This enables us to discover a general purpose parametric loss function that provides a drop-in replacement for cross-entropy. Our loss can be used in standard training pipelines to efficiently train robust models using any neural architecture on new datasets. The results show that it clearly surpasses cross-entropy, enables simple ERM to outperform some more complicated prior DG methods, and provides state-of-the-art performance across a variety of DG benchmarks. 
Furthermore, unlike most existing DG approaches, our setup applies to the most practical setting of single-source domain generalisation, on which we show significant improvement.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22b/gao22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/gao22b-supp.zip", "pdf_size": 780667, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2351875210347766586&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Informatics, University of Edinburgh+Samsung AI Center, Cambridge; School of Informatics, University of Edinburgh; School of Informatics, University of Edinburgh; School of Informatics, University of Edinburgh+Samsung AI Center, Cambridge", "aff_domain": "ed.ac.uk; ; ; ", "email": "ed.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gao22b.html", "aff_unique_index": "0+1;0;0;0+1", "aff_unique_norm": "University of Edinburgh;Samsung", "aff_unique_dep": "School of Informatics;AI Center", "aff_unique_url": "https://www.ed.ac.uk;https://www.samsung.com/global/research-innovation/ai-research-centers/samsung-ai-center-cambridge/", "aff_unique_abbr": "Edinburgh;SAC", "aff_campus_unique_index": "0+1;0;0;0+1", "aff_campus_unique": "Edinburgh;Cambridge", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Low-Complexity Deep Convolutional Neural Networks on Fully Homomorphic Encryption Using Multiplexed Parallel Convolutions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17801", "id": "17801", "proceeding": "https://proceedings.mlr.press/v162/lee22e.html", "poster": "/media/PosterPDFs/ICML%202022/445e1050156c6ae8c082a8422bb7dfc0.png?t=1657812123.4378915", "slides": "", "author_site": "Eunsang Lee, Joon-Woo Lee, Junghyun Lee, Young-Sik KIM, Yongjune Kim, Jong-Seon No, Woosuk Choi", "author": "Eunsang Lee; Joon-Woo Lee; Junghyun Lee; Young-Sik Kim; Yongjune Kim; Jong-Seon No; Woosuk Choi", "abstract": "Recently, the standard ResNet-20 network was successfully implemented on the fully homomorphic encryption scheme, residue number system variant Cheon-Kim-Kim-Song (RNS-CKKS) scheme using bootstrapping, but the implementation lacks practicality due to high latency and low security level. To improve the performance, we first minimize total bootstrapping runtime using multiplexed parallel convolution that collects sparse output data for multiple channels compactly. We also propose the imaginary-removing bootstrapping to prevent the deep neural networks from catastrophic divergence during approximate ReLU operations. In addition, we optimize level consumptions and use lighter and tighter parameters. Simulation results show that we have 4.67x lower inference latency and 134x less amortized runtime (runtime per image) for ResNet-20 compared to the state-of-the-art previous work, and we achieve standard 128-bit security. 
Furthermore, we successfully implement ResNet-110 with high accuracy on the RNS-CKKS scheme for the first time.", "bibtex": "@InProceedings{pmlr-v162-lee22e,\n title = \t {Low-Complexity Deep Convolutional Neural Networks on Fully Homomorphic Encryption Using Multiplexed Parallel Convolutions},\n author = {Lee, Eunsang and Lee, Joon-Woo and Lee, Junghyun and Kim, Young-Sik and Kim, Yongjune and No, Jong-Seon and Choi, Woosuk},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12403--12422},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22e/lee22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22e.html},\n abstract = \t {Recently, the standard ResNet-20 network was successfully implemented on the fully homomorphic encryption scheme, residue number system variant Cheon-Kim-Kim-Song (RNS-CKKS) scheme using bootstrapping, but the implementation lacks practicality due to high latency and low security level. To improve the performance, we first minimize total bootstrapping runtime using multiplexed parallel convolution that collects sparse output data for multiple channels compactly. We also propose the imaginary-removing bootstrapping to prevent the deep neural networks from catastrophic divergence during approximate ReLU operations. In addition, we optimize level consumptions and use lighter and tighter parameters. Simulation results show that we have 4.67x lower inference latency and 134x less amortized runtime (runtime per image) for ResNet-20 compared to the state-of-the-art previous work, and we achieve standard 128-bit security. Furthermore, we successfully implement ResNet-110 with high accuracy on the RNS-CKKS scheme for the first time.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22e/lee22e.pdf", "supp": "", "pdf_size": 1352871, "gs_citation": 166, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14019419488949017881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Dept. of Electrical and Computer Engineering, INMC, Seoul National University; Dept. of Electrical and Computer Engineering, INMC, Seoul National University; Dept. of Electrical and Computer Engineering, INMC, Seoul National University; Dept. of Information and Communication Engineering, Chosun University; Dept. of Electrical Engineering and Computer Science, DGIST; Dept. of Electrical and Computer Engineering, INMC, Seoul National University; Samsung Advanced Institute of Technology", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;chosun.ac.kr;dgist.ac.kr;snu.ac.kr;samsung.com", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;chosun.ac.kr;dgist.ac.kr;snu.ac.kr;samsung.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/lee22e.html", "aff_unique_index": "0;0;0;1;2;0;3", "aff_unique_norm": "Seoul National University;Chosun University;DGIST;Samsung", "aff_unique_dep": "Dept. of Electrical and Computer Engineering;Dept. of Information and Communication Engineering;Dept. 
of Electrical Engineering and Computer Science;Samsung Advanced Institute of Technology", "aff_unique_url": "https://www.snu.ac.kr;http://www.chosun.ac.kr;https://www.dgist.ac.kr;https://www.sait.samsung.com", "aff_unique_abbr": "SNU;Chosun;DGIST;SAIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Low-Precision Stochastic Gradient Langevin Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17245", "id": "17245", "proceeding": "https://proceedings.mlr.press/v162/zhang22ag.html", "poster": "/media/PosterPDFs/ICML%202022/d582ac40970f9885836a61d7b2c662e4.png?t=1658120792.547424", "slides": "", "author_site": "Ruqi Zhang, Andrew Wilson, Christopher De Sa", "author": "Ruqi Zhang; Andrew Gordon Wilson; Christopher De Sa", "abstract": "While low-precision optimization has been widely used to accelerate deep learning, low-precision sampling remains largely unexplored. As a consequence, sampling is simply infeasible in many large-scale scenarios, despite providing remarkable benefits to generalization and uncertainty estimation for neural networks. In this paper, we provide the first study of low-precision Stochastic Gradient Langevin Dynamics (SGLD), showing that its costs can be significantly reduced without sacrificing performance, due to its intrinsic ability to handle system noise. We prove that the convergence of low-precision SGLD with full-precision gradient accumulators is less affected by the quantization error than its SGD counterpart in the strongly convex setting. To further enable low-precision gradient accumulators, we develop a new quantization function for SGLD that preserves the variance in each update step. We demonstrate that low-precision SGLD achieves comparable performance to full-precision SGLD with only 8 bits on a variety of deep learning tasks.", "bibtex": "@InProceedings{pmlr-v162-zhang22ag,\n title = \t {Low-Precision Stochastic Gradient {L}angevin Dynamics},\n author = {Zhang, Ruqi and Wilson, Andrew Gordon and De Sa, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26624--26644},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ag/zhang22ag.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ag.html},\n abstract = \t {While low-precision optimization has been widely used to accelerate deep learning, low-precision sampling remains largely unexplored. As a consequence, sampling is simply infeasible in many large-scale scenarios, despite providing remarkable benefits to generalization and uncertainty estimation for neural networks. In this paper, we provide the first study of low-precision Stochastic Gradient Langevin Dynamics (SGLD), showing that its costs can be significantly reduced without sacrificing performance, due to its intrinsic ability to handle system noise. We prove that the convergence of low-precision SGLD with full-precision gradient accumulators is less affected by the quantization error than its SGD counterpart in the strongly convex setting. 
To further enable low-precision gradient accumulators, we develop a new quantization function for SGLD that preserves the variance in each update step. We demonstrate that low-precision SGLD achieves comparable performance to full-precision SGLD with only 8 bits on a variety of deep learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ag/zhang22ag.pdf", "supp": "", "pdf_size": 574247, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5250731865302553140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "The University of Texas at Austin; New York University; Cornell University", "aff_domain": "utexas.edu; ; ", "email": "utexas.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22ag.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Texas at Austin;New York University;Cornell University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utexas.edu;https://www.nyu.edu;https://www.cornell.edu", "aff_unique_abbr": "UT Austin;NYU;Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "LyaNet: A Lyapunov Framework for Training Neural ODEs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17059", "id": "17059", "proceeding": "https://proceedings.mlr.press/v162/rodriguez22a.html", "poster": "/media/PosterPDFs/ICML%202022/c778a2d8bf30ef1d3c2d6bc5696defad_mt2wV2b.png?t=1658232573.6152477", "slides": "", "author_site": "Ivan Dario Jimenez Rodriguez, Aaron Ames, Yisong Yue", "author": "Ivan Dario Jimenez Rodriguez; Aaron Ames; Yisong Yue", "abstract": "We propose a method for training ordinary differential equations by using a control-theoretic Lyapunov condition for stability. Our approach, called LyaNet, is based on a novel Lyapunov loss formulation that encourages the inference dynamics to converge quickly to the correct prediction. Theoretically, we show that minimizing Lyapunov loss guarantees exponential convergence to the correct solution and enables a novel robustness guarantee. We also provide practical algorithms, including one that avoids the cost of backpropagating through a solver or using the adjoint method. Relative to standard Neural ODE training, we empirically find that LyaNet can offer improved prediction performance, faster convergence of inference dynamics, and improved adversarial robustness. Our code is available at https://github.com/ivandariojr/LyapunovLearning.", "bibtex": "@InProceedings{pmlr-v162-rodriguez22a,\n title = \t {{L}ya{N}et: A {L}yapunov Framework for Training Neural {ODE}s},\n author = {Rodriguez, Ivan Dario Jimenez and Ames, Aaron and Yue, Yisong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18687--18703},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rodriguez22a/rodriguez22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rodriguez22a.html},\n abstract = \t {We propose a method for training ordinary differential equations by using a control-theoretic Lyapunov condition for stability. 
Our approach, called LyaNet, is based on a novel Lyapunov loss formulation that encourages the inference dynamics to converge quickly to the correct prediction. Theoretically, we show that minimizing Lyapunov loss guarantees exponential convergence to the correct solution and enables a novel robustness guarantee. We also provide practical algorithms, including one that avoids the cost of backpropagating through a solver or using the adjoint method. Relative to standard Neural ODE training, we empirically find that LyaNet can offer improved prediction performance, faster convergence of inference dynamics, and improved adversarial robustness. Our code is available at https://github.com/ivandariojr/LyapunovLearning.}\n}", "pdf": "https://proceedings.mlr.press/v162/rodriguez22a/rodriguez22a.pdf", "supp": "", "pdf_size": 1692683, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11176249487221195122&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computational and Mathematical Sciences, California Institute of Technology; Department of Computational and Mathematical Sciences, California Institute of Technology; Department of Computational and Mathematical Sciences, California Institute of Technology + Argo AI", "aff_domain": "caltech.edu; ; ", "email": "caltech.edu; ; ", "github": "https://github.com/ivandariojr/LyapunovLearning", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/rodriguez22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "California Institute of Technology;Argo AI", "aff_unique_dep": "Department of Computational and Mathematical Sciences;", "aff_unique_url": "https://www.caltech.edu;https://www.argo.ai", "aff_unique_abbr": "Caltech;Argo AI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "United States" }, { "title": "Lyapunov Density Models: Constraining Distribution Shift in Learning-Based Control", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18219", "id": "18219", "proceeding": "https://proceedings.mlr.press/v162/kang22a.html", "poster": "", "slides": "", "author_site": "Katie Kang, Paula Gradu, Jason Choi, Michael Janner, Claire Tomlin, Sergey Levine", "author": "Katie Kang; Paula Gradu; Jason J Choi; Michael Janner; Claire Tomlin; Sergey Levine", "abstract": "Learned models and policies can generalize effectively when evaluated within the distribution of the training data, but can produce unpredictable and erroneous outputs on out-of-distribution inputs. In order to avoid distribution shift when deploying learning-based control algorithms, we seek a mechanism to constrain the agent to states and actions that resemble those that the method was trained on. In control theory, Lyapunov stability and control-invariant sets allow us to make guarantees about controllers that stabilize the system around specific states, while in machine learning, density models allow us to estimate the training data distribution. Can we combine these two concepts, producing learning-based control algorithms that constrain the system to in-distribution states using only in-distribution actions? 
In this paper, we propose to do this by combining concepts from Lyapunov stability and density estimation, introducing Lyapunov density models: a generalization of control Lyapunov functions and density models that provides guarantees about an agent\u2019s ability to stay in-distribution over its entire trajectory.", "bibtex": "@InProceedings{pmlr-v162-kang22a,\n title = \t {{L}yapunov Density Models: Constraining Distribution Shift in Learning-Based Control},\n author = {Kang, Katie and Gradu, Paula and Choi, Jason J and Janner, Michael and Tomlin, Claire and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10708--10733},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kang22a/kang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kang22a.html},\n abstract = \t {Learned models and policies can generalize effectively when evaluated within the distribution of the training data, but can produce unpredictable and erroneous outputs on out-of-distribution inputs. In order to avoid distribution shift when deploying learning-based control algorithms, we seek a mechanism to constrain the agent to states and actions that resemble those that the method was trained on. In control theory, Lyapunov stability and control-invariant sets allow us to make guarantees about controllers that stabilize the system around specific states, while in machine learning, density models allow us to estimate the training data distribution. Can we combine these two concepts, producing learning-based control algorithms that constrain the system to in-distribution states using only in-distribution actions? 
In this paper, we propose to do this by combining concepts from Lyapunov stability and density estimation, introducing Lyapunov density models: a generalization of control Lyapunov functions and density models that provides guarantees about an agent\u2019s ability to stay in-distribution over its entire trajectory.}\n}", "pdf": "https://proceedings.mlr.press/v162/kang22a/kang22a.pdf", "supp": "", "pdf_size": 4175651, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2043695775128696507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley", "aff_domain": "eecs.berkeley.edu; ; ; ; ; ", "email": "eecs.berkeley.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/kang22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "MAE-DET: Revisiting Maximum Entropy Principle in Zero-Shot NAS for Efficient Object Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16415", "id": "16415", "proceeding": "https://proceedings.mlr.press/v162/sun22c.html", "poster": "/media/PosterPDFs/ICML%202022/566f0ea4f6c2e947f36795c8f58ba901_ZIlgpI7.png?t=1655461546.3312201", "slides": "/media/icml-2022/Slides/16415_aThN57W.pdf", "author_site": "Zhenhong Sun, Ming Lin, Xiuyu Sun, Zhiyu Tan, Hao Li, rong jin", "author": "Zhenhong Sun; Ming Lin; Xiuyu Sun; Zhiyu Tan; Hao Li; Rong Jin", "abstract": "In object detection, the detection backbone consumes more than half of the overall inference cost. Recent researches attempt to reduce this cost by optimizing the backbone architecture with the help of Neural Architecture Search (NAS). However, existing NAS methods for object detection require hundreds to thousands of GPU hours of searching, making them impractical in fast-paced research and development. In this work, we propose a novel zero-shot NAS method to address this issue. The proposed method, named MAE-DET, automatically designs efficient detection backbones via the Maximum Entropy Principle without training network parameters, reducing the architecture design cost to nearly zero yet delivering the state-of-the-art (SOTA) performance. Under the hood, MAE-DET maximizes the differential entropy of detection backbones, leading to a better feature extractor for object detection under the same computational budgets. After merely one GPU day of fully automatic design, MAE-DET innovates SOTA detection backbones on multiple detection benchmark datasets with little human intervention. Comparing to ResNet-50 backbone, MAE-DET is $+2.0%$ better in mAP when using the same amount of FLOPs/parameters, and is $1.54$ times faster on NVIDIA V100 at the same mAP. 
Code and pre-trained models are available here (https://github.com/alibaba/lightweight-neural-architecture-search).", "bibtex": "@InProceedings{pmlr-v162-sun22c,\n title = \t {{MAE}-{DET}: Revisiting Maximum Entropy Principle in Zero-Shot {NAS} for Efficient Object Detection},\n author = {Sun, Zhenhong and Lin, Ming and Sun, Xiuyu and Tan, Zhiyu and Li, Hao and Jin, Rong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20810--20826},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sun22c/sun22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/sun22c.html},\n abstract = \t {In object detection, the detection backbone consumes more than half of the overall inference cost. Recent researches attempt to reduce this cost by optimizing the backbone architecture with the help of Neural Architecture Search (NAS). However, existing NAS methods for object detection require hundreds to thousands of GPU hours of searching, making them impractical in fast-paced research and development. In this work, we propose a novel zero-shot NAS method to address this issue. The proposed method, named MAE-DET, automatically designs efficient detection backbones via the Maximum Entropy Principle without training network parameters, reducing the architecture design cost to nearly zero yet delivering the state-of-the-art (SOTA) performance. Under the hood, MAE-DET maximizes the differential entropy of detection backbones, leading to a better feature extractor for object detection under the same computational budgets. After merely one GPU day of fully automatic design, MAE-DET innovates SOTA detection backbones on multiple detection benchmark datasets with little human intervention. Comparing to ResNet-50 backbone, MAE-DET is $+2.0%$ better in mAP when using the same amount of FLOPs/parameters, and is $1.54$ times faster on NVIDIA V100 at the same mAP. 
Code and pre-trained models are available here (https://github.com/alibaba/lightweight-neural-architecture-search).}\n}", "pdf": "https://proceedings.mlr.press/v162/sun22c/sun22c.pdf", "supp": "", "pdf_size": 14023838, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6294936790423161188&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Alibaba Group; Alibaba Group; Alibaba Group; Alibaba Group; Alibaba Group; Alibaba Group", "aff_domain": "alibaba-inc.com; ; ; ; ; ", "email": "alibaba-inc.com; ; ; ; ; ", "github": "https://github.com/alibaba/lightweight-neural-architecture-search", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/sun22c.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "MAML and ANIL Provably Learn Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17109", "id": "17109", "proceeding": "https://proceedings.mlr.press/v162/collins22a.html", "poster": "/media/PosterPDFs/ICML%202022/50adecfc746426ca10973a067421d0bf.png?t=1658165451.8889148", "slides": "", "author_site": "Liam Collins, Aryan Mokhtari, Sewoong Oh, Sanjay Shakkottai", "author": "Liam Collins; Aryan Mokhtari; Sewoong Oh; Sanjay Shakkottai", "abstract": "Recent empirical evidence has driven conventional wisdom to believe that gradient-based meta-learning (GBML) methods perform well at few-shot learning because they learn an expressive data representation that is shared across tasks. However, the mechanics of GBML have remained largely mysterious from a theoretical perspective. In this paper, we prove that two well-known GBML methods, MAML and ANIL, as well as their first-order approximations, are capable of learning common representation among a set of given tasks. Specifically, in the well-known multi-task linear representation learning setting, they are able to recover the ground-truth representation at an exponentially fast rate. Moreover, our analysis illuminates that the driving force causing MAML and ANIL to recover the underlying representation is that they adapt the final layer of their model, which harnesses the underlying task diversity to improve the representation in all directions of interest. 
To the best of our knowledge, these are the first results to show that MAML and/or ANIL learn expressive representations and to rigorously explain why they do so.", "bibtex": "@InProceedings{pmlr-v162-collins22a,\n title = \t {{MAML} and {ANIL} Provably Learn Representations},\n author = {Collins, Liam and Mokhtari, Aryan and Oh, Sewoong and Shakkottai, Sanjay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4238--4310},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/collins22a/collins22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/collins22a.html},\n abstract = \t {Recent empirical evidence has driven conventional wisdom to believe that gradient-based meta-learning (GBML) methods perform well at few-shot learning because they learn an expressive data representation that is shared across tasks. However, the mechanics of GBML have remained largely mysterious from a theoretical perspective. In this paper, we prove that two well-known GBML methods, MAML and ANIL, as well as their first-order approximations, are capable of learning common representation among a set of given tasks. Specifically, in the well-known multi-task linear representation learning setting, they are able to recover the ground-truth representation at an exponentially fast rate. Moreover, our analysis illuminates that the driving force causing MAML and ANIL to recover the underlying representation is that they adapt the final layer of their model, which harnesses the underlying task diversity to improve the representation in all directions of interest. 
To the best of our knowledge, these are the first results to show that MAML and/or ANIL learn expressive representations and to rigorously explain why they do so.}\n}", "pdf": "https://proceedings.mlr.press/v162/collins22a/collins22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/collins22a-supp.zip", "pdf_size": 3983863, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=771595913734981383&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, The University of Texas at Austin; Department of Electrical and Computer Engineering, The University of Texas at Austin; School of Computer Science and Engineering, University of Washington; Department of Electrical and Computer Engineering, The University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ", "email": "utexas.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/collins22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Texas at Austin;University of Washington", "aff_unique_dep": "Department of Electrical and Computer Engineering;School of Computer Science and Engineering", "aff_unique_url": "https://www.utexas.edu;https://www.washington.edu", "aff_unique_abbr": "UT Austin;UW", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Austin;Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MASER: Multi-Agent Reinforcement Learning with Subgoals Generated from Experience Replay Buffer", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17579", "id": "17579", "proceeding": "https://proceedings.mlr.press/v162/jeon22a.html", "poster": "/media/PosterPDFs/ICML%202022/89db09d856d45d361982edc10ce738a2.png?t=1657163262.7884789", "slides": "", "author_site": "JEON JEEWON, WOOJUN KIM, Whiyoung Jung, Youngchul Sung", "author": "Jeewon Jeon; Woojun Kim; Whiyoung Jung; Youngchul Sung", "abstract": "In this paper, we consider cooperative multi-agent reinforcement learning (MARL) with sparse reward. To tackle this problem, we propose a novel method named MASER: MARL with subgoals generated from experience replay buffer. Under the widely-used assumption of centralized training with decentralized execution and consistent Q-value decomposition for MARL, MASER automatically generates proper subgoals for multiple agents from the experience replay buffer by considering both individual Q-value and total Q-value. Then, MASER designs individual intrinsic reward for each agent based on actionable representation relevant to Q-learning so that the agents reach their subgoals while maximizing the joint action value. 
Numerical results show that MASER significantly outperforms other state-of-the-art MARL algorithms on the StarCraft II micromanagement benchmark.", "bibtex": "@InProceedings{pmlr-v162-jeon22a,\n title = \t {{MASER}: Multi-Agent Reinforcement Learning with Subgoals Generated from Experience Replay Buffer},\n author = {Jeon, Jeewon and Kim, Woojun and Jung, Whiyoung and Sung, Youngchul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10041--10052},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jeon22a/jeon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jeon22a.html},\n abstract = \t {In this paper, we consider cooperative multi-agent reinforcement learning (MARL) with sparse reward. To tackle this problem, we propose a novel method named MASER: MARL with subgoals generated from experience replay buffer. Under the widely-used assumption of centralized training with decentralized execution and consistent Q-value decomposition for MARL, MASER automatically generates proper subgoals for multiple agents from the experience replay buffer by considering both individual Q-value and total Q-value. Then, MASER designs individual intrinsic reward for each agent based on actionable representation relevant to Q-learning so that the agents reach their subgoals while maximizing the joint action value. Numerical results show that MASER significantly outperforms other state-of-the-art MARL algorithms on the StarCraft II micromanagement benchmark.}\n}", "pdf": "https://proceedings.mlr.press/v162/jeon22a/jeon22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jeon22a-supp.zip", "pdf_size": 3730780, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3511041100939657281&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jeon22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "ME-GAN: Learning Panoptic Electrocardio Representations for Multi-view ECG Synthesis Conditioned on Heart Diseases", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16179", "id": "16179", "proceeding": "https://proceedings.mlr.press/v162/chen22n.html", "poster": "", "slides": "", "author_site": "Jintai Chen, KuanLun Liao, Kun Wei, Haochao Ying, Danny Z Chen, Jian Wu", "author": "Jintai Chen; Kuanlun Liao; Kun Wei; Haochao Ying; Danny Z Chen; Jian Wu", "abstract": "Electrocardiogram (ECG) is a widely used non-invasive diagnostic tool for 
heart diseases. Many studies have devised ECG analysis models (e.g., classifiers) to assist diagnosis. As an upstream task, researchers have built generative models to synthesize ECG data, which are beneficial to providing training samples, privacy protection, and annotation reduction. However, previous generative methods for ECG often neither synthesized multi-view data nor dealt with heart disease conditions. In this paper, we propose a novel disease-aware generative adversarial network for multi-view ECG synthesis called ME-GAN, which attains panoptic electrocardio representations conditioned on heart diseases and projects the representations onto multiple standard views to yield ECG signals. Since ECG manifestations of heart diseases are often localized in specific waveforms, we propose a new \"mixup normalization\" to inject disease information precisely into suitable locations. In addition, we propose a \"view discriminator\" to revert disordered ECG views into a pre-determined order, supervising the generator to obtain ECG representing correct view characteristics. Besides, a new metric, rFID, is presented to assess the quality of the synthesized ECG signals. Comprehensive experiments verify that our ME-GAN performs well on multi-view ECG signal synthesis with trusty morbid manifestations.", "bibtex": "@InProceedings{pmlr-v162-chen22n,\n title = \t {{ME}-{GAN}: Learning Panoptic Electrocardio Representations for Multi-view {ECG} Synthesis Conditioned on Heart Diseases},\n author = {Chen, Jintai and Liao, Kuanlun and Wei, Kun and Ying, Haochao and Chen, Danny Z and Wu, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3360--3370},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22n/chen22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22n.html},\n abstract = \t {Electrocardiogram (ECG) is a widely used non-invasive diagnostic tool for heart diseases. Many studies have devised ECG analysis models (e.g., classifiers) to assist diagnosis. As an upstream task, researchers have built generative models to synthesize ECG data, which are beneficial to providing training samples, privacy protection, and annotation reduction. However, previous generative methods for ECG often neither synthesized multi-view data nor dealt with heart disease conditions. In this paper, we propose a novel disease-aware generative adversarial network for multi-view ECG synthesis called ME-GAN, which attains panoptic electrocardio representations conditioned on heart diseases and projects the representations onto multiple standard views to yield ECG signals. Since ECG manifestations of heart diseases are often localized in specific waveforms, we propose a new \"mixup normalization\" to inject disease information precisely into suitable locations. In addition, we propose a \"view discriminator\" to revert disordered ECG views into a pre-determined order, supervising the generator to obtain ECG representing correct view characteristics. Besides, a new metric, rFID, is presented to assess the quality of the synthesized ECG signals. 
Comprehensive experiments verify that our ME-GAN performs well on multi-view ECG signal synthesis with trusty morbid manifestations.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22n/chen22n.pdf", "supp": "", "pdf_size": 2717655, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3766193897010337858&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "College of Computer Science and Technology, Zhejiang University, Hangzhou 310058, China; College of Computer Science and Technology, Zhejiang University, Hangzhou 310058, China; School of Electronic Engineering, Xidian University, Xi\u2019an 710071, China; School of Public Health, Zhejiang University, Hangzhou, China + The Key Laboratory of Intelligent Preventive Medicine of Zhejiang Province, Hangzhou, Zhejiang 310058, China; Department of Computer Science and Engineering, University of Notre Dame, Notre Dame, IN 46556, USA; The Second Affiliated Hospital School of Medicine, School of Public Health, and Institute of Wenzhou, Zhejiang University, Hangzhou 310058, China", "aff_domain": "zju.edu.cn; ; ; ; ; ", "email": "zju.edu.cn; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22n.html", "aff_unique_index": "0;0;1;0+2;3;0", "aff_unique_norm": "Zhejiang University;Xidian University;Zhejiang Province Key Laboratory of Intelligent Preventive Medicine;University of Notre Dame", "aff_unique_dep": "College of Computer Science and Technology;School of Electronic Engineering;Intelligent Preventive Medicine;Department of Computer Science and Engineering", "aff_unique_url": "http://www.zju.edu.cn;http://www.xidian.edu.cn;;https://www.nd.edu", "aff_unique_abbr": "ZJU;Xidian;;Notre Dame", "aff_campus_unique_index": "0;0;1;0+0;2;0", "aff_campus_unique": "Hangzhou;Xi'an;Notre Dame", "aff_country_unique_index": "0;0;0;0+0;1;0", "aff_country_unique": "China;United States" }, { "title": "Making Linear MDPs Practical via Contrastive Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17101", "id": "17101", "proceeding": "https://proceedings.mlr.press/v162/zhang22x.html", "poster": "/media/PosterPDFs/ICML%202022/0b9e57c46de934cee33b0e8d1839bfc2_piOclv8.png?t=1657649814.212089", "slides": "", "author_site": "Tianjun Zhang, Tongzheng Ren, Mengjiao Yang, Joseph E Gonzalez, Dale Schuurmans, Bo Dai", "author": "Tianjun Zhang; Tongzheng Ren; Mengjiao Yang; Joseph Gonzalez; Dale Schuurmans; Bo Dai", "abstract": "It is common to address the curse of dimensionality in Markov decision processes (MDPs) by exploiting low-rank representations. This motivates much of the recent theoretical study on linear MDPs. However, most approaches require a given representation under unrealistic assumptions about the normalization of the decomposition or introduce unresolved computational challenges in practice. Instead, we consider an alternative definition of linear MDPs that automatically ensures normalization while allowing efficient representation learning via contrastive estimation. The framework also admits confidence-adjusted index algorithms, enabling an efficient and principled approach to incorporating optimism or pessimism in the face of uncertainty. To the best of our knowledge, this provides the first practical representation learning method for linear MDPs that achieves both strong theoretical guarantees and empirical performance. 
Theoretically, we prove that the proposed algorithm is sample efficient in both the online and offline settings. Empirically, we demonstrate superior performance over existing state-of-the-art model-based and model-free algorithms on several benchmarks.", "bibtex": "@InProceedings{pmlr-v162-zhang22x,\n title = \t {Making Linear {MDP}s Practical via Contrastive Representation Learning},\n author = {Zhang, Tianjun and Ren, Tongzheng and Yang, Mengjiao and Gonzalez, Joseph and Schuurmans, Dale and Dai, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26447--26466},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22x/zhang22x.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22x.html},\n abstract = \t {It is common to address the curse of dimensionality in Markov decision processes (MDPs) by exploiting low-rank representations. This motivates much of the recent theoretical study on linear MDPs. However, most approaches require a given representation under unrealistic assumptions about the normalization of the decomposition or introduce unresolved computational challenges in practice. Instead, we consider an alternative definition of linear MDPs that automatically ensures normalization while allowing efficient representation learning via contrastive estimation. The framework also admits confidence-adjusted index algorithms, enabling an efficient and principled approach to incorporating optimism or pessimism in the face of uncertainty. To the best of our knowledge, this provides the first practical representation learning method for linear MDPs that achieves both strong theoretical guarantees and empirical performance. Theoretically, we prove that the proposed algorithm is sample efficient in both the online and offline settings. 
Empirically, we demonstrate superior performance over existing state-of-the-art model-based and model-free algorithms on several benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22x/zhang22x.pdf", "supp": "", "pdf_size": 1485794, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2931409266511454539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "UC Berkeley; UT Austin+Google Brain; UC Berkeley+Google Brain; UC Berkeley; Google Brain+University of Alberta; Google Brain", "aff_domain": "berkeley.edu;utexas.edu; ;berkeley.edu; ;google.com", "email": "berkeley.edu;utexas.edu; ;berkeley.edu; ;google.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22x.html", "aff_unique_index": "0;1+2;0+2;0;2+3;2", "aff_unique_norm": "University of California, Berkeley;University of Texas at Austin;Google;University of Alberta", "aff_unique_dep": ";;Google Brain;", "aff_unique_url": "https://www.berkeley.edu;https://www.utexas.edu;https://brain.google.com;https://www.ualberta.ca", "aff_unique_abbr": "UC Berkeley;UT Austin;Google Brain;UAlberta", "aff_campus_unique_index": "0;1+2;0+2;0;2;2", "aff_campus_unique": "Berkeley;Austin;Mountain View;", "aff_country_unique_index": "0;0+0;0+0;0;0+1;0", "aff_country_unique": "United States;Canada" }, { "title": "Marginal Distribution Adaptation for Discrete Sets via Module-Oriented Divergence Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17265", "id": "17265", "proceeding": "https://proceedings.mlr.press/v162/dai22c.html", "poster": "", "slides": "", "author_site": "Hanjun Dai, Mengjiao Yang, Yuan Xue, Dale Schuurmans, Bo Dai", "author": "Hanjun Dai; Mengjiao Yang; Yuan Xue; Dale Schuurmans; Bo Dai", "abstract": "Distributions over discrete sets capture the essential statistics including the high-order correlation among elements. Such information provides powerful insight for decision making across various application domains, e.g., product assortment based on product distribution in shopping carts. While deep generative models trained on pre-collected data can capture existing distributions, such pre-trained models are usually not capable of aligning with a target domain in the presence of distribution shift due to reasons such as temporal shift or the change in the population mix. We develop a general framework to adapt a generative model subject to a (possibly counterfactual) target data distribution with both sampling and computation efficiency. Concretely, instead of re-training a full model from scratch, we reuse the learned modules to preserve the correlations between set elements, while only adjusting corresponding components to align with target marginal constraints. We instantiate the approach for three commonly used forms of discrete set distribution\u2014latent variable, autoregressive, and energy based models\u2014and provide efficient solutions for marginal-constrained optimization in either primal or dual forms. 
Experiments on both synthetic and real-world e-commerce and EHR datasets show that the proposed framework is able to practically align a generative model to match marginal constraints under distribution shift.", "bibtex": "@InProceedings{pmlr-v162-dai22c,\n title = \t {Marginal Distribution Adaptation for Discrete Sets via Module-Oriented Divergence Minimization},\n author = {Dai, Hanjun and Yang, Mengjiao and Xue, Yuan and Schuurmans, Dale and Dai, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4605--4617},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dai22c/dai22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/dai22c.html},\n abstract = \t {Distributions over discrete sets capture the essential statistics including the high-order correlation among elements. Such information provides powerful insight for decision making across various application domains, e.g., product assortment based on product distribution in shopping carts. While deep generative models trained on pre-collected data can capture existing distributions, such pre-trained models are usually not capable of aligning with a target domain in the presence of distribution shift due to reasons such as temporal shift or the change in the population mix. We develop a general framework to adapt a generative model subject to a (possibly counterfactual) target data distribution with both sampling and computation efficiency. Concretely, instead of re-training a full model from scratch, we reuse the learned modules to preserve the correlations between set elements, while only adjusting corresponding components to align with target marginal constraints. We instantiate the approach for three commonly used forms of discrete set distribution\u2014latent variable, autoregressive, and energy based models\u2014and provide efficient solutions for marginal-constrained optimization in either primal or dual forms. 
Experiments on both synthetic and real-world e-commerce and EHR datasets show that the proposed framework is able to practically align a generative model to match marginal constraints under distribution shift.}\n}", "pdf": "https://proceedings.mlr.press/v162/dai22c/dai22c.pdf", "supp": "", "pdf_size": 1406816, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14336561605718998895&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Google Research, Brain Team; Google Research, Brain Team; Google Cloud; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dai22c.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Marginal Tail-Adaptive Normalizing Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18185", "id": "18185", "proceeding": "https://proceedings.mlr.press/v162/laszkiewicz22a.html", "poster": "/media/PosterPDFs/ICML%202022/b27d5296bede63b1493a5d321d4e8092.png?t=1657551383.2341893", "slides": "", "author_site": "Mike Laszkiewicz, Johannes Lederer, Asja Fischer", "author": "Mike Laszkiewicz; Johannes Lederer; Asja Fischer", "abstract": "Learning the tail behavior of a distribution is a notoriously difficult problem. By definition, the number of samples from the tail is small, and deep generative models, such as normalizing flows, tend to concentrate on learning the body of the distribution. In this paper, we focus on improving the ability of normalizing flows to correctly capture the tail behavior and, thus, form more accurate models. We prove that the marginal tailedness of an autoregressive flow can be controlled via the tailedness of the marginals of its base distribution. This theoretical insight leads us to a novel type of flows based on flexible base distributions and data-driven linear layers. An empirical analysis shows that the proposed method improves on the accuracy{\u2014}especially on the tails of the distribution{\u2014}and is able to generate heavy-tailed data. We demonstrate its application on a weather and climate example, in which capturing the tail behavior is essential.", "bibtex": "@InProceedings{pmlr-v162-laszkiewicz22a,\n title = \t {Marginal Tail-Adaptive Normalizing Flows},\n author = {Laszkiewicz, Mike and Lederer, Johannes and Fischer, Asja},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12020--12048},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/laszkiewicz22a/laszkiewicz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/laszkiewicz22a.html},\n abstract = \t {Learning the tail behavior of a distribution is a notoriously difficult problem. 
By definition, the number of samples from the tail is small, and deep generative models, such as normalizing flows, tend to concentrate on learning the body of the distribution. In this paper, we focus on improving the ability of normalizing flows to correctly capture the tail behavior and, thus, form more accurate models. We prove that the marginal tailedness of an autoregressive flow can be controlled via the tailedness of the marginals of its base distribution. This theoretical insight leads us to a novel type of flows based on flexible base distributions and data-driven linear layers. An empirical analysis shows that the proposed method improves on the accuracy{\u2014}especially on the tails of the distribution{\u2014}and is able to generate heavy-tailed data. We demonstrate its application on a weather and climate example, in which capturing the tail behavior is essential.}\n}", "pdf": "https://proceedings.mlr.press/v162/laszkiewicz22a/laszkiewicz22a.pdf", "supp": "", "pdf_size": 2245267, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3241792279775112520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Faculty of Mathematics, Ruhr University, Bochum, Germany+Center of Computer Science, Bochum, Germany; Faculty of Mathematics, Ruhr University, Bochum, Germany; Center of Computer Science, Bochum, Germany", "aff_domain": "rub.de; ; ", "email": "rub.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/laszkiewicz22a.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "Ruhr University Bochum", "aff_unique_dep": "Faculty of Mathematics", "aff_unique_url": "https://www.ruhr-uni-bochum.de", "aff_unique_abbr": "RUB", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Bochum", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "Germany" }, { "title": "Markov Chain Monte Carlo for Continuous-Time Switching Dynamical Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18051", "id": "18051", "proceeding": "https://proceedings.mlr.press/v162/kohs22a.html", "poster": "/media/PosterPDFs/ICML%202022/75c58d36157505a600e0695ed0b3a22d.png?t=1657535605.2341044", "slides": "", "author_site": "Lukas K\u00f6hs, Bastian Alt, Heinz Koeppl", "author": "Lukas K\u00f6hs; Bastian Alt; Heinz Koeppl", "abstract": "Switching dynamical systems are an expressive model class for the analysis of time-series data. Since, as in many fields within the natural and engineering sciences, the systems under study typically evolve continuously in time, it is natural to consider continuous-time model formulations consisting of switching stochastic differential equations governed by an underlying Markov jump process. Inference in these types of models is however notoriously difficult, and tractable computational schemes are rare. In this work, we propose a novel inference algorithm utilizing a Markov Chain Monte Carlo approach. The presented Gibbs sampler allows one to efficiently obtain samples from the exact continuous-time posterior processes. Our framework naturally enables Bayesian parameter estimation, and we also include an estimate for the diffusion covariance, which is oftentimes assumed fixed in stochastic differential equation models. 
We evaluate our framework under the modeling assumption and compare it against an existing variational inference approach.", "bibtex": "@InProceedings{pmlr-v162-kohs22a,\n title = \t {{M}arkov Chain {M}onte {C}arlo for Continuous-Time Switching Dynamical Systems},\n author = {K{\\\"o}hs, Lukas and Alt, Bastian and Koeppl, Heinz},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11430--11454},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kohs22a/kohs22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kohs22a.html},\n abstract = \t {Switching dynamical systems are an expressive model class for the analysis of time-series data. Since, as in many fields within the natural and engineering sciences, the systems under study typically evolve continuously in time, it is natural to consider continuous-time model formulations consisting of switching stochastic differential equations governed by an underlying Markov jump process. Inference in these types of models is however notoriously difficult, and tractable computational schemes are rare. In this work, we propose a novel inference algorithm utilizing a Markov Chain Monte Carlo approach. The presented Gibbs sampler allows one to efficiently obtain samples from the exact continuous-time posterior processes. Our framework naturally enables Bayesian parameter estimation, and we also include an estimate for the diffusion covariance, which is oftentimes assumed fixed in stochastic differential equation models. We evaluate our framework under the modeling assumption and compare it against an existing variational inference approach.}\n}", "pdf": "https://proceedings.mlr.press/v162/kohs22a/kohs22a.pdf", "supp": "", "pdf_size": 1641757, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7273453605320947194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Information Technology, Technische Universit\u00e4t Darmstadt; Department of Electrical Engineering and Information Technology, Technische Universit\u00e4t Darmstadt; Department of Electrical Engineering and Information Technology, Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "email": "tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kohs22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Department of Electrical Engineering and Information Technology", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Maslow\u2019s Hammer in Catastrophic Forgetting: Node Re-Use vs. 
Node Activation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16951", "id": "16951", "proceeding": "https://proceedings.mlr.press/v162/lee22g.html", "poster": "/media/PosterPDFs/ICML%202022/bb1443cc31d7396bf73e7858cea114e1.png?t=1657441041.2105331", "slides": "/media/icml-2022/Slides/16951.pdf", "author_site": "Sebastian Lee, Stefano Sarao Mannelli, Claudia Clopath, Sebastian Goldt, Andrew Saxe", "author": "Sebastian Lee; Stefano Sarao Mannelli; Claudia Clopath; Sebastian Goldt; Andrew Saxe", "abstract": "Continual learning\u2014learning new tasks in sequence while maintaining performance on old tasks\u2014remains particularly challenging for artificial neural networks. Surprisingly, the amount of forgetting does not increase with the dissimilarity between the learned tasks, but appears to be worst in an intermediate similarity regime. In this paper we theoretically analyse both a synthetic teacher-student framework and a real data setup to provide an explanation of this phenomenon that we name Maslow\u2019s Hammer hypothesis. Our analysis reveals the presence of a trade-off between node activation and node re-use that results in worst forgetting in the intermediate regime. Using this understanding we reinterpret popular algorithmic interventions for catastrophic interference in terms of this trade-off, and identify the regimes in which they are most effective.", "bibtex": "@InProceedings{pmlr-v162-lee22g,\n title = \t {Maslow\u2019s Hammer in Catastrophic Forgetting: Node Re-Use vs. Node Activation},\n author = {Lee, Sebastian and Mannelli, Stefano Sarao and Clopath, Claudia and Goldt, Sebastian and Saxe, Andrew},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12455--12477},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22g/lee22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22g.html},\n abstract = \t {Continual learning\u2014learning new tasks in sequence while maintaining performance on old tasks\u2014remains particularly challenging for artificial neural networks. Surprisingly, the amount of forgetting does not increase with the dissimilarity between the learned tasks, but appears to be worst in an intermediate similarity regime. In this paper we theoretically analyse both a synthetic teacher-student framework and a real data setup to provide an explanation of this phenomenon that we name Maslow\u2019s Hammer hypothesis. Our analysis reveals the presence of a trade-off between node activation and node re-use that results in worst forgetting in the intermediate regime. 
Using this understanding we reinterpret popular algorithmic interventions for catastrophic interference in terms of this trade-off, and identify the regimes in which they are most effective.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22g/lee22g.pdf", "supp": "", "pdf_size": 19223130, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3374238946346765384&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Imperial College, London, UK+Sainsbury Wellcome Centre, UCL+Gatsby Computational Neuroscience Unit, UCL; Sainsbury Wellcome Centre, UCL+Gatsby Computational Neuroscience Unit, UCL; Imperial College, London, UK+Sainsbury Wellcome Centre, UCL; International School of Advanced Studies (SISSA), Trieste, Italy; Sainsbury Wellcome Centre, UCL+Gatsby Computational Neuroscience Unit, UCL+CIFAR Azrieli Global Scholars program, CIFAR, Toronto, Canada", "aff_domain": "imperial.ac.uk; ; ; ;ucl.ac.uk", "email": "imperial.ac.uk; ; ; ;ucl.ac.uk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lee22g.html", "aff_unique_index": "0+1+1;1+1;0+1;2;1+1+3", "aff_unique_norm": "Imperial College London;University College London;International School of Advanced Studies;CIFAR", "aff_unique_dep": ";Sainsbury Wellcome Centre;;Azrieli Global Scholars program", "aff_unique_url": "https://www.imperial.ac.uk;https://www.ucl.ac.uk;https://www.sissa.it;https://www.cifar.ca", "aff_unique_abbr": "ICL;UCL;SISSA;CIFAR", "aff_campus_unique_index": "0;;0;2;3", "aff_campus_unique": "London;;Trieste;Toronto", "aff_country_unique_index": "0+0+0;0+0;0+0;1;0+0+2", "aff_country_unique": "United Kingdom;Italy;Canada" }, { "title": "Massively Parallel $k$-Means Clustering for Perturbation Resilient Instances", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16001", "id": "16001", "proceeding": "https://proceedings.mlr.press/v162/cohen-addad22b.html", "poster": "/media/PosterPDFs/ICML%202022/3a2ee1496acdeeece4124055b3799c33_Blf2fri.png?t=1657866902.6164582", "slides": "/media/icml-2022/Slides/16001_KRRxiub.pdf", "author_site": "Vincent Cohen-Addad, Vahab Mirrokni, Peilin Zhong", "author": "Vincent Cohen-Addad; Vahab Mirrokni; Peilin Zhong", "abstract": "We consider $k$-means clustering of $n$ data points in Euclidean space in the Massively Parallel Computation (MPC) model, a computational model which is an abstraction of modern massively parallel computing system such as MapReduce. Recent work provides evidence that getting $O(1)$-approximate $k$-means solution for general input points using $o(\\log n)$ rounds in the MPC model may be impossible under certain conditions [Ghaffari, Kuhn \\& Uitto\u20192019]. However, the real-world data points usually have better structures. One instance of interest is the set of data points which is perturbation resilient [Bilu \\& Linial\u20192010]. In particular, a point set is $\\alpha$-perturbation resilient for $k$-means if perturbing pairwise distances by multiplicative factors in the range $[1,\\alpha]$ does not change the optimum $k$-means clusters. We bypass the worst case lower bound by considering the perturbation resilient input points and showing $o(\\log n)$ rounds $k$-means clustering algorithms for these instances in the MPC model. 
Specifically, we show a fully scalable $(1+\\varepsilon)$-approximate $k$-means clustering algorithm for $O(\\alpha)$-perturbation resilient instance in the MPC model using $O(1)$ rounds and ${O}_{\\varepsilon,d}(n^{1+1/\\alpha^2+o(1)})$ total space. If the space per machine is sufficiently larger than $k$, i.e., at least $k\\cdot n^{\\Omega(1)}$, we also develop an optimal $k$-means clustering algorithm for $O(\\alpha)$-perturbation resilient instance in MPC using $O(1)$ rounds and ${O}_d(n^{1+o(1)}\\cdot(n^{1/\\alpha^2}+k))$ total space.", "bibtex": "@InProceedings{pmlr-v162-cohen-addad22b,\n title = \t {Massively Parallel $k$-Means Clustering for Perturbation Resilient Instances},\n author = {Cohen-Addad, Vincent and Mirrokni, Vahab and Zhong, Peilin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4180--4201},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cohen-addad22b/cohen-addad22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/cohen-addad22b.html},\n abstract = \t {We consider $k$-means clustering of $n$ data points in Euclidean space in the Massively Parallel Computation (MPC) model, a computational model which is an abstraction of modern massively parallel computing system such as MapReduce. Recent work provides evidence that getting $O(1)$-approximate $k$-means solution for general input points using $o(\\log n)$ rounds in the MPC model may be impossible under certain conditions [Ghaffari, Kuhn \\& Uitto\u20192019]. However, the real-world data points usually have better structures. One instance of interest is the set of data points which is perturbation resilient [Bilu \\& Linial\u20192010]. In particular, a point set is $\\alpha$-perturbation resilient for $k$-means if perturbing pairwise distances by multiplicative factors in the range $[1,\\alpha]$ does not change the optimum $k$-means clusters. We bypass the worst case lower bound by considering the perturbation resilient input points and showing $o(\\log n)$ rounds $k$-means clustering algorithms for these instances in the MPC model. Specifically, we show a fully scalable $(1+\\varepsilon)$-approximate $k$-means clustering algorithm for $O(\\alpha)$-perturbation resilient instance in the MPC model using $O(1)$ rounds and ${O}_{\\varepsilon,d}(n^{1+1/\\alpha^2+o(1)})$ total space. 
If the space per machine is sufficiently larger than $k$, i.e., at least $k\\cdot n^{\\Omega(1)}$, we also develop an optimal $k$-means clustering algorithm for $O(\\alpha)$-perturbation resilient instance in MPC using $O(1)$ rounds and ${O}_d(n^{1+o(1)}\\cdot(n^{1/\\alpha^2}+k))$ total space.}\n}", "pdf": "https://proceedings.mlr.press/v162/cohen-addad22b/cohen-addad22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/cohenaddad22b-supp.zip", "pdf_size": 517248, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6857814007515879942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Google Research; Google Research; Google Research", "aff_domain": "google.com;google.com;google.com", "email": "google.com;google.com;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cohen-addad22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Matching Learned Causal Effects of Neural Networks with Domain Priors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18299", "id": "18299", "proceeding": "https://proceedings.mlr.press/v162/kancheti22a.html", "poster": "/media/PosterPDFs/ICML%202022/ebb87faa733d9a04ebb40f422b135fb0_SUl6WkQ.png?t=1657644202.6535277", "slides": "/media/icml-2022/Slides/18299.pdf", "author_site": "Sai Srinivas Kancheti, Gowtham Reddy Abbavaram, Vineeth N Balasubramanian, Amit Sharma", "author": "Sai Srinivas Kancheti; Abbavaram Gowtham Reddy; Vineeth N Balasubramanian; Amit Sharma", "abstract": "A trained neural network can be interpreted as a structural causal model (SCM) that provides the effect of changing input variables on the model\u2019s output. However, if training data contains both causal and correlational relationships, a model that optimizes prediction accuracy may not necessarily learn the true causal relationships between input and output variables. On the other hand, expert users often have prior knowledge of the causal relationship between certain input variables and output from domain knowledge. Therefore, we propose a regularization method that aligns the learned causal effects of a neural network with domain priors, including both direct and total causal effects. We show that this approach can generalize to different kinds of domain priors, including monotonicity of causal effect of an input variable on output or zero causal effect of a variable on output for purposes of fairness. Our experiments on twelve benchmark datasets show its utility in regularizing a neural network model to maintain desired causal effects, without compromising on accuracy. 
Importantly, we also show that a model thus trained is robust and gets improved accuracy on noisy inputs.", "bibtex": "@InProceedings{pmlr-v162-kancheti22a,\n title = \t {Matching Learned Causal Effects of Neural Networks with Domain Priors},\n author = {Kancheti, Sai Srinivas and Reddy, Abbavaram Gowtham and Balasubramanian, Vineeth N and Sharma, Amit},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10676--10696},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kancheti22a/kancheti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kancheti22a.html},\n abstract = \t {A trained neural network can be interpreted as a structural causal model (SCM) that provides the effect of changing input variables on the model\u2019s output. However, if training data contains both causal and correlational relationships, a model that optimizes prediction accuracy may not necessarily learn the true causal relationships between input and output variables. On the other hand, expert users often have prior knowledge of the causal relationship between certain input variables and output from domain knowledge. Therefore, we propose a regularization method that aligns the learned causal effects of a neural network with domain priors, including both direct and total causal effects. We show that this approach can generalize to different kinds of domain priors, including monotonicity of causal effect of an input variable on output or zero causal effect of a variable on output for purposes of fairness. Our experiments on twelve benchmark datasets show its utility in regularizing a neural network model to maintain desired causal effects, without compromising on accuracy. 
Importantly, we also show that a model thus trained is robust and gets improved accuracy on noisy inputs.}\n}", "pdf": "https://proceedings.mlr.press/v162/kancheti22a/kancheti22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kancheti22a-supp.zip", "pdf_size": 11217801, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6089251092049651005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Indian Institute of Technology Hyderabad; Indian Institute of Technology Hyderabad; Indian Institute of Technology Hyderabad; Microsoft Research, Bangalore", "aff_domain": "iith.ac.in; ; ; ", "email": "iith.ac.in; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kancheti22a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Indian Institute of Technology Hyderabad;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.iith.ac.in;https://www.microsoft.com/en-us/research/group/bangalore", "aff_unique_abbr": "IIT Hyderabad;MSR", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Hyderabad;Bangalore", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "India" }, { "title": "Matching Normalizing Flows and Probability Paths on Manifolds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17243", "id": "17243", "proceeding": "https://proceedings.mlr.press/v162/ben-hamu22a.html", "poster": "/media/PosterPDFs/ICML%202022/fea9c11c4ad9a395a636ed944a28b51a.png?t=1658005007.1565158", "slides": "", "author_site": "Heli Ben-Hamu, samuel cohen, Joey Bose, Brandon Amos, Maximilian Nickel, Aditya Grover, Ricky T. Q. Chen, Yaron Lipman", "author": "Heli Ben-Hamu; Samuel Cohen; Joey Bose; Brandon Amos; Maximillian Nickel; Aditya Grover; Ricky T. Q. Chen; Yaron Lipman", "abstract": "Continuous Normalizing Flows (CNFs) are a class of generative models that transform a prior distribution to a model distribution by solving an ordinary differential equation (ODE). We propose to train CNFs on manifolds by minimizing probability path divergence (PPD), a novel family of divergences between the probability density path generated by the CNF and a target probability density path. PPD is formulated using a logarithmic mass conservation formula which is a linear first order partial differential equation relating the log target probabilities and the CNF\u2019s defining vector field. PPD has several key benefits over existing methods: it sidesteps the need to solve an ODE per iteration, readily applies to manifold data, scales to high dimensions, and is compatible with a large family of target paths interpolating pure noise and data in finite time. Theoretically, PPD is shown to bound classical probability divergences. Empirically, we show that CNFs learned by minimizing PPD achieve state-of-the-art results in likelihoods and sample quality on existing low-dimensional manifold benchmarks, and is the first example of a generative model to scale to moderately high dimensional manifolds.", "bibtex": "@InProceedings{pmlr-v162-ben-hamu22a,\n title = \t {Matching Normalizing Flows and Probability Paths on Manifolds},\n author = {Ben-Hamu, Heli and Cohen, Samuel and Bose, Joey and Amos, Brandon and Nickel, Maximillian and Grover, Aditya and Chen, Ricky T. Q. 
and Lipman, Yaron},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1749--1763},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ben-hamu22a/ben-hamu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ben-hamu22a.html},\n abstract = \t {Continuous Normalizing Flows (CNFs) are a class of generative models that transform a prior distribution to a model distribution by solving an ordinary differential equation (ODE). We propose to train CNFs on manifolds by minimizing probability path divergence (PPD), a novel family of divergences between the probability density path generated by the CNF and a target probability density path. PPD is formulated using a logarithmic mass conservation formula which is a linear first order partial differential equation relating the log target probabilities and the CNF\u2019s defining vector field. PPD has several key benefits over existing methods: it sidesteps the need to solve an ODE per iteration, readily applies to manifold data, scales to high dimensions, and is compatible with a large family of target paths interpolating pure noise and data in finite time. Theoretically, PPD is shown to bound classical probability divergences. Empirically, we show that CNFs learned by minimizing PPD achieve state-of-the-art results in likelihoods and sample quality on existing low-dimensional manifold benchmarks, and is the first example of a generative model to scale to moderately high dimensional manifolds.}\n}", "pdf": "https://proceedings.mlr.press/v162/ben-hamu22a/ben-hamu22a.pdf", "supp": "", "pdf_size": 9275750, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=264025871177527927&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Weizmann Institute of Science; Meta AI Research+Centre for Arti\ufb01cial Intelligence, University College London; Meta AI Research; Meta AI Research; Meta AI Research; Meta AI Research; Meta AI Research; Weizmann Institute of Science+Meta AI Research", "aff_domain": "weizmann.ac.il; ; ; ; ; ; ; ", "email": "weizmann.ac.il; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/ben-hamu22a.html", "aff_unique_index": "0;1+2;1;1;1;1;1;0+1", "aff_unique_norm": "Weizmann Institute of Science;Meta;University College London", "aff_unique_dep": ";Meta AI Research;Centre for Arti\ufb01cial Intelligence", "aff_unique_url": "https://www.weizmann.org.il;https://meta.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Weizmann;Meta AI;UCL", "aff_campus_unique_index": "1;", "aff_campus_unique": ";London", "aff_country_unique_index": "0;1+2;1;1;1;1;1;0+1", "aff_country_unique": "Israel;United States;United Kingdom" }, { "title": "Matching Structure for Dual Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16005", "id": "16005", "proceeding": "https://proceedings.mlr.press/v162/fei22a.html", "poster": "/media/PosterPDFs/ICML%202022/4558dbb6f6f8bb2e16d03b85bde76e2c_qFdtrym.png?t=1657278017.4155908", "slides": "", "author_site": "Hao Fei, Shengqiong Wu, Yafeng Ren, Meishan Zhang", "author": "Hao Fei; Shengqiong Wu; Yafeng Ren; Meishan Zhang", "abstract": "Many natural language processing (NLP) tasks 
appear in dual forms, which are generally solved by dual learning technique that models the dualities between the coupled tasks. In this work, we propose to further enhance dual learning with structure matching that explicitly builds structural connections in between. Starting with the dual text$\\leftrightarrow$text generation, we perform dually-syntactic structure co-echoing of the region of interest (RoI) between the task pair, together with a syntax cross-reconstruction at the decoding side. We next extend the idea to a text$\\leftrightarrow$non-text setup, making alignment between the syntactic-semantic structure. Over 2*14 tasks covering 5 dual learning scenarios, the proposed structure matching method shows its significant effectiveness in enhancing existing dual learning. Our method can retrieve the key RoIs that are highly crucial to the task performance. Besides NLP tasks, it is also revealed that our approach has great potential in facilitating more non-text$\\leftrightarrow$non-text scenarios.", "bibtex": "@InProceedings{pmlr-v162-fei22a,\n title = \t {Matching Structure for Dual Learning},\n author = {Fei, Hao and Wu, Shengqiong and Ren, Yafeng and Zhang, Meishan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6373--6391},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fei22a/fei22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fei22a.html},\n abstract = \t {Many natural language processing (NLP) tasks appear in dual forms, which are generally solved by dual learning technique that models the dualities between the coupled tasks. In this work, we propose to further enhance dual learning with structure matching that explicitly builds structural connections in between. Starting with the dual text$\\leftrightarrow$text generation, we perform dually-syntactic structure co-echoing of the region of interest (RoI) between the task pair, together with a syntax cross-reconstruction at the decoding side. We next extend the idea to a text$\\leftrightarrow$non-text setup, making alignment between the syntactic-semantic structure. Over 2*14 tasks covering 5 dual learning scenarios, the proposed structure matching method shows its significant effectiveness in enhancing existing dual learning. Our method can retrieve the key RoIs that are highly crucial to the task performance. 
Besides NLP tasks, it is also revealed that our approach has great potential in facilitating more non-text$\\leftrightarrow$non-text scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/fei22a/fei22a.pdf", "supp": "", "pdf_size": 815446, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13178178322802879803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "School of Computing, National University of Singapore, Singapore+Sea-NExT Joint Lab, Singapore; School of Computing, National University of Singapore, Singapore; School of Interpreting and Translation Studies, Guangdong University of Foreign Studies, China; Institute of Computing and Intelligence, Harbin Institute of Technology (Shenzhen), China", "aff_domain": "whu.edu.cn;gmail.com; ; ", "email": "whu.edu.cn;gmail.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/fei22a.html", "aff_unique_index": "0+1;0;2;3", "aff_unique_norm": "National University of Singapore;Sea-NExT Joint Lab;Guangdong University of Foreign Studies;Harbin Institute of Technology", "aff_unique_dep": "School of Computing;;School of Interpreting and Translation Studies;Institute of Computing and Intelligence", "aff_unique_url": "https://www.nus.edu.sg;;;http://www.hhit.edu.cn", "aff_unique_abbr": "NUS;;;HIT", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0+0;0;1;1", "aff_country_unique": "Singapore;China" }, { "title": "Maximum Likelihood Training for Score-based Diffusion ODEs by High Order Denoising Score Matching", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17683", "id": "17683", "proceeding": "https://proceedings.mlr.press/v162/lu22f.html", "poster": "/media/PosterPDFs/ICML%202022/246a3c5544feb054f3ea718f61adfa16.png?t=1656760530.545212", "slides": "/media/icml-2022/Slides/17683.pdf", "author_site": "Cheng Lu, Kaiwen Zheng, Fan Bao, Jianfei Chen, Chongxuan Li, Jun Zhu", "author": "Cheng Lu; Kaiwen Zheng; Fan Bao; Jianfei Chen; Chongxuan Li; Jun Zhu", "abstract": "Score-based generative models have excellent performance in terms of generation quality and likelihood. They model the data distribution by matching a parameterized score network with first-order data score functions. The score network can be used to define an ODE (\u201cscore-based diffusion ODE\u201d) for exact likelihood evaluation. However, the relationship between the likelihood of the ODE and the score matching objective is unclear. In this work, we prove that matching the first-order score is not sufficient to maximize the likelihood of the ODE, by showing a gap between the maximum likelihood and score matching objectives. To fill up this gap, we show that the negative likelihood of the ODE can be bounded by controlling the first, second, and third-order score matching errors; and we further present a novel high-order denoising score matching method to enable maximum likelihood training of score-based diffusion ODEs. Our algorithm guarantees that the higher-order matching error is bounded by the training error and the lower-order errors. 
We empirically observe that by high-order score matching, score-based diffusion ODEs achieve better likelihood on both synthetic data and CIFAR-10, while retaining the high generation quality.", "bibtex": "@InProceedings{pmlr-v162-lu22f,\n title = \t {Maximum Likelihood Training for Score-based Diffusion {ODE}s by High Order Denoising Score Matching},\n author = {Lu, Cheng and Zheng, Kaiwen and Bao, Fan and Chen, Jianfei and Li, Chongxuan and Zhu, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14429--14460},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22f/lu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22f.html},\n abstract = \t {Score-based generative models have excellent performance in terms of generation quality and likelihood. They model the data distribution by matching a parameterized score network with first-order data score functions. The score network can be used to define an ODE (\u201cscore-based diffusion ODE\u201d) for exact likelihood evaluation. However, the relationship between the likelihood of the ODE and the score matching objective is unclear. In this work, we prove that matching the first-order score is not sufficient to maximize the likelihood of the ODE, by showing a gap between the maximum likelihood and score matching objectives. To fill up this gap, we show that the negative likelihood of the ODE can be bounded by controlling the first, second, and third-order score matching errors; and we further present a novel high-order denoising score matching method to enable maximum likelihood training of score-based diffusion ODEs. Our algorithm guarantees that the higher-order matching error is bounded by the training error and the lower-order errors. We empirically observe that by high-order score matching, score-based diffusion ODEs achieve better likelihood on both synthetic data and CIFAR-10, while retaining the high generation quality.}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22f/lu22f.pdf", "supp": "", "pdf_size": 1595088, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2275057454258549935&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/lu22f.html" }, { "title": "Meaningfully debugging model mistakes using conceptual counterfactual explanations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16721", "id": "16721", "proceeding": "https://proceedings.mlr.press/v162/abid22a.html", "poster": "", "slides": "", "author_site": "Abubakar Abid, Mert Yuksekgonul, James Zou", "author": "Abubakar Abid; Mert Yuksekgonul; James Zou", "abstract": "Understanding and explaining the mistakes made by trained models is critical to many machine learning objectives, such as improving robustness, addressing concept drift, and mitigating biases. However, this is often an ad hoc process that involves manually looking at the model\u2019s mistakes on many test samples and guessing at the underlying reasons for those incorrect predictions. 
In this paper, we propose a systematic approach, conceptual counterfactual explanations (CCE), that explains why a classifier makes a mistake on a particular test sample(s) in terms of human-understandable concepts (e.g. this zebra is misclassified as a dog because of faint stripes). We base CCE on two prior ideas: counterfactual explanations and concept activation vectors, and validate our approach on well-known pretrained models, showing that it explains the models\u2019 mistakes meaningfully. In addition, for new models trained on data with spurious correlations, CCE accurately identifies the spurious correlation as the cause of model mistakes from a single misclassified test sample. On two challenging medical applications, CCE generated useful insights, confirmed by clinicians, into biases and mistakes the model makes in real-world settings. The code for CCE is publicly available and can easily be applied to explain mistakes in new models.", "bibtex": "@InProceedings{pmlr-v162-abid22a,\n title = \t {Meaningfully debugging model mistakes using conceptual counterfactual explanations},\n author = {Abid, Abubakar and Yuksekgonul, Mert and Zou, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {66--88},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/abid22a/abid22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/abid22a.html},\n abstract = \t {Understanding and explaining the mistakes made by trained models is critical to many machine learning objectives, such as improving robustness, addressing concept drift, and mitigating biases. However, this is often an ad hoc process that involves manually looking at the model\u2019s mistakes on many test samples and guessing at the underlying reasons for those incorrect predictions. In this paper, we propose a systematic approach, conceptual counterfactual explanations (CCE), that explains why a classifier makes a mistake on a particular test sample(s) in terms of human-understandable concepts (e.g. this zebra is misclassified as a dog because of faint stripes). We base CCE on two prior ideas: counterfactual explanations and concept activation vectors, and validate our approach on well-known pretrained models, showing that it explains the models\u2019 mistakes meaningfully. In addition, for new models trained on data with spurious correlations, CCE accurately identifies the spurious correlation as the cause of model mistakes from a single misclassified test sample. On two challenging medical applications, CCE generated useful insights, confirmed by clinicians, into biases and mistakes the model makes in real-world settings. 
The code for CCE is publicly available and can easily be applied to explain mistakes in new models.}\n}", "pdf": "https://proceedings.mlr.press/v162/abid22a/abid22a.pdf", "supp": "", "pdf_size": 5397000, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2849569429175172034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Electrical Engineering, Stanford University; Department of Computer Science, Stanford University; Department of Biomedical Data Science, Stanford University", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "https://github.com/mertyg/debug-mistakes-cce", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/abid22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Measure Estimation in the Barycentric Coding Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16617", "id": "16617", "proceeding": "https://proceedings.mlr.press/v162/werenski22a.html", "poster": "/media/PosterPDFs/ICML%202022/fc2022c89b61c76bbef978f1370660bf.png?t=1658251818.912428", "slides": "", "author_site": "Matthew Werenski, Ruijie Jiang, Abiy Tasissa, Shuchin Aeron, James Murphy", "author": "Matthew Werenski; Ruijie Jiang; Abiy Tasissa; Shuchin Aeron; James M Murphy", "abstract": "This paper considers the problem of measure estimation under the barycentric coding model (BCM), in which an unknown measure is assumed to belong to the set of Wasserstein-2 barycenters of a finite set of known measures. Estimating a measure under this model is equivalent to estimating the unknown barycentric coordinates. We provide novel geometrical, statistical, and computational insights for measure estimation under the BCM, consisting of three main results. Our first main result leverages the Riemannian geometry of Wasserstein-2 space to provide a procedure for recovering the barycentric coordinates as the solution to a quadratic optimization problem assuming access to the true reference measures. The essential geometric insight is that the parameters of this quadratic problem are determined by inner products between the optimal displacement maps from the given measure to the reference measures defining the BCM. Our second main result then establishes an algorithm for solving for the coordinates in the BCM when all the measures are observed empirically via i.i.d. samples. We prove precise rates of convergence for this algorithm\u2014determined by the smoothness of the underlying measures and their dimensionality\u2014thereby guaranteeing its statistical consistency. 
Finally, we demonstrate the utility of the BCM and associated estimation procedures in three application areas: (i) covariance estimation for Gaussian measures; (ii) image processing; and (iii) natural language processing.", "bibtex": "@InProceedings{pmlr-v162-werenski22a,\n title = \t {Measure Estimation in the Barycentric Coding Model},\n author = {Werenski, Matthew and Jiang, Ruijie and Tasissa, Abiy and Aeron, Shuchin and Murphy, James M},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23781--23803},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/werenski22a/werenski22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/werenski22a.html},\n abstract = \t {This paper considers the problem of measure estimation under the barycentric coding model (BCM), in which an unknown measure is assumed to belong to the set of Wasserstein-2 barycenters of a finite set of known measures. Estimating a measure under this model is equivalent to estimating the unknown barycentric coordinates. We provide novel geometrical, statistical, and computational insights for measure estimation under the BCM, consisting of three main results. Our first main result leverages the Riemannian geometry of Wasserstein-2 space to provide a procedure for recovering the barycentric coordinates as the solution to a quadratic optimization problem assuming access to the true reference measures. The essential geometric insight is that the parameters of this quadratic problem are determined by inner products between the optimal displacement maps from the given measure to the reference measures defining the BCM. Our second main result then establishes an algorithm for solving for the coordinates in the BCM when all the measures are observed empirically via i.i.d. samples. We prove precise rates of convergence for this algorithm\u2014determined by the smoothness of the underlying measures and their dimensionality\u2014thereby guaranteeing its statistical consistency. 
Finally, we demonstrate the utility of the BCM and associated estimation procedures in three application areas: (i) covariance estimation for Gaussian measures; (ii) image processing; and (iii) natural language processing.}\n}", "pdf": "https://proceedings.mlr.press/v162/werenski22a/werenski22a.pdf", "supp": "", "pdf_size": 579260, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3529680784651732155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Tufts University; Department of Electrical and Computer Engineering, Tufts University; Department of Mathematics, Tufts University; Department of Electrical and Computer Engineering, Tufts University; Department of Mathematics, Tufts University", "aff_domain": "tufts.edu; ; ; ; ", "email": "tufts.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/werenski22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tufts University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.tufts.edu", "aff_unique_abbr": "Tufts", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Measuring Representational Robustness of Neural Networks Through Shared Invariances", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18387", "id": "18387", "proceeding": "https://proceedings.mlr.press/v162/nanda22a.html", "poster": "/media/PosterPDFs/ICML%202022/5291822d0636dc429e80e953c58b6a76.png?t=1657806942.274593", "slides": "", "author_site": "Vedant Nanda, Till Speicher, Camila Kolling, John P Dickerson, Krishna Gummadi, Adrian Weller", "author": "Vedant Nanda; Till Speicher; Camila Kolling; John P Dickerson; Krishna Gummadi; Adrian Weller", "abstract": "A major challenge in studying robustness in deep learning is defining the set of \u201cmeaningless\u201d perturbations to which a given Neural Network (NN) should be invariant. Most work on robustness implicitly uses a human as the reference model to define such perturbations. Our work offers a new view on robustness by using another reference NN to define the set of perturbations a given NN should be invariant to, thus generalizing the reliance on a reference \u201chuman NN\u201d to any NN. This makes measuring robustness equivalent to measuring the extent to which two NNs share invariances. We propose a measure called \\stir, which faithfully captures the extent to which two NNs share invariances. \\stir re-purposes existing representation similarity measures to make them suitable for measuring shared invariances. Using our measure, we are able to gain insights about how shared invariances vary with changes in weight initialization, architecture, loss functions, and training dataset. 
Our implementation is available at: \\url{https://github.com/nvedant07/STIR}.", "bibtex": "@InProceedings{pmlr-v162-nanda22a,\n title = \t {Measuring Representational Robustness of Neural Networks Through Shared Invariances},\n author = {Nanda, Vedant and Speicher, Till and Kolling, Camila and Dickerson, John P and Gummadi, Krishna and Weller, Adrian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16368--16382},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nanda22a/nanda22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nanda22a.html},\n abstract = \t {A major challenge in studying robustness in deep learning is defining the set of \u201cmeaningless\u201d perturbations to which a given Neural Network (NN) should be invariant. Most work on robustness implicitly uses a human as the reference model to define such perturbations. Our work offers a new view on robustness by using another reference NN to define the set of perturbations a given NN should be invariant to, thus generalizing the reliance on a reference \u201chuman NN\u201d to any NN. This makes measuring robustness equivalent to measuring the extent to which two NNs share invariances. We propose a measure called \\stir, which faithfully captures the extent to which two NNs share invariances. \\stir re-purposes existing representation similarity measures to make them suitable for measuring shared invariances. Using our measure, we are able to gain insights about how shared invariances vary with changes in weight initialization, architecture, loss functions, and training dataset. 
Our implementation is available at: \\url{https://github.com/nvedant07/STIR}.}\n}", "pdf": "https://proceedings.mlr.press/v162/nanda22a/nanda22a.pdf", "supp": "", "pdf_size": 684125, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11535296107699738994&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Maryland, College Park+Max Planck Institute for Software Systems; Max Planck Institute for Software Systems; PUCRS; University of Maryland, College Park; Max Planck Institute for Software Systems; University of Cambridge+The Alan Turing Institute", "aff_domain": "cs.umd.edu; ; ; ; ; ", "email": "cs.umd.edu; ; ; ; ; ", "github": "https://github.com/nvedant07/STIR", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/nanda22a.html", "aff_unique_index": "0+1;1;2;0;1;3+4", "aff_unique_norm": "University of Maryland;Max Planck Institute for Software Systems;Pontif\u00edcia Universidade Cat\u00f3lica do Rio Grande do Sul;University of Cambridge;Alan Turing Institute", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www/umd.edu;https://www.mpi-sws.org;https://www.pucrs.br;https://www.cam.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "UMD;MPI-SWS;PUCRS;Cambridge;ATI", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "College Park;;Cambridge", "aff_country_unique_index": "0+1;1;2;0;1;3+3", "aff_country_unique": "United States;Germany;Brazil;United Kingdom" }, { "title": "Measuring dissimilarity with diffeomorphism invariance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17587", "id": "17587", "proceeding": "https://proceedings.mlr.press/v162/cantelobre22a.html", "poster": "/media/PosterPDFs/ICML%202022/9ff0525c64bf3d4c9957a1d4397f1b40.png?t=1657722342.0634925", "slides": "/media/icml-2022/Slides/17587.pdf", "author_site": "Th\u00e9ophile Cantelobre, Carlo Ciliberto, Benjamin Guedj, Alessandro Rudi", "author": "Th\u00e9ophile Cantelobre; Carlo Ciliberto; Benjamin Guedj; Alessandro Rudi", "abstract": "Measures of similarity (or dissimilarity) are a key ingredient to many machine learning algorithms. We introduce DID, a pairwise dissimilarity measure applicable to a wide range of data spaces, which leverages the data\u2019s internal structure to be invariant to diffeomorphisms. We prove that DID enjoys properties which make it relevant for theoretical study and practical use. By representing each datum as a function, DID is defined as the solution to an optimization problem in a Reproducing Kernel Hilbert Space and can be expressed in closed-form. In practice, it can be efficiently approximated via Nystr{\u00f6}m sampling. 
Empirical experiments support the merits of DID.", "bibtex": "@InProceedings{pmlr-v162-cantelobre22a,\n title = \t {Measuring dissimilarity with diffeomorphism invariance},\n author = {Cantelobre, Th{\\'e}ophile and Ciliberto, Carlo and Guedj, Benjamin and Rudi, Alessandro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2572--2596},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cantelobre22a/cantelobre22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cantelobre22a.html},\n abstract = \t {Measures of similarity (or dissimilarity) are a key ingredient to many machine learning algorithms. We introduce DID, a pairwise dissimilarity measure applicable to a wide range of data spaces, which leverages the data\u2019s internal structure to be invariant to diffeomorphisms. We prove that DID enjoys properties which make it relevant for theoretical study and practical use. By representing each datum as a function, DID is defined as the solution to an optimization problem in a Reproducing Kernel Hilbert Space and can be expressed in closed-form. In practice, it can be efficiently approximated via Nystr{\u00f6}m sampling. Empirical experiments support the merits of DID.}\n}", "pdf": "https://proceedings.mlr.press/v162/cantelobre22a/cantelobre22a.pdf", "supp": "", "pdf_size": 7800814, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9356741545436506583&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "DI ENS, Ecole normale sup\u00e9rieure, Universit\u00e9 PSL, CNRS, Inria, Paris, France+Inria London, UK; Centre for AI, Department of Computer Science, University College London, UK; Inria London, UK+Inria, Lille - Nord Europe Research Centre, Lille, France; DI ENS, Ecole normale sup\u00e9rieure, Universit\u00e9 PSL, CNRS, Inria, Paris, France", "aff_domain": "inria.fr;ucl.ac.uk;inria.fr;inria.fr", "email": "inria.fr;ucl.ac.uk;inria.fr;inria.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/cantelobre22a.html", "aff_unique_index": "0+1;2;1+1;0", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;INRIA;University College London", "aff_unique_dep": "DI ENS;;Department of Computer Science", "aff_unique_url": "https://www.ens.fr;https://www.inria.fr/en;https://www.ucl.ac.uk", "aff_unique_abbr": "ENS;Inria;UCL", "aff_campus_unique_index": "0+1;1;1+2;0", "aff_campus_unique": "Paris;London;Lille - Nord Europe", "aff_country_unique_index": "0+1;1;1+0;0", "aff_country_unique": "France;United Kingdom" }, { "title": "Measuring the Effect of Training Data on Deep Learning Predictions via Randomized Experiments", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16815", "id": "16815", "proceeding": "https://proceedings.mlr.press/v162/lin22h.html", "poster": "/media/PosterPDFs/ICML%202022/c4b31ce7d95c75ca70d50c19aef08bf1.png?t=1658011011.5312316", "slides": "", "author_site": "Jinkun Lin, Anqi Zhang, Mathias L\u00e9cuyer, Jinyang Li, Aurojit Panda, Siddhartha Sen", "author": "Jinkun Lin; Anqi Zhang; Mathias L\u00e9cuyer; Jinyang Li; Aurojit Panda; Siddhartha Sen", "abstract": "We develop a new, principled algorithm for estimating the contribution of training data points 
to the behavior of a deep learning model, such as a specific prediction it makes. Our algorithm estimates the AME, a quantity that measures the expected (average) marginal effect of adding a data point to a subset of the training data, sampled from a given distribution. When subsets are sampled from the uniform distribution, the AME reduces to the well-known Shapley value. Our approach is inspired by causal inference and randomized experiments: we sample different subsets of the training data to train multiple submodels, and evaluate each submodel\u2019s behavior. We then use a LASSO regression to jointly estimate the AME of each data point, based on the subset compositions. Under sparsity assumptions ($k \\ll N$ datapoints have large AME), our estimator requires only $O(k\\log N)$ randomized submodel trainings, improving upon the best prior Shapley value estimators.", "bibtex": "@InProceedings{pmlr-v162-lin22h,\n title = \t {Measuring the Effect of Training Data on Deep Learning Predictions via Randomized Experiments},\n author = {Lin, Jinkun and Zhang, Anqi and L{\\'e}cuyer, Mathias and Li, Jinyang and Panda, Aurojit and Sen, Siddhartha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13468--13504},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22h/lin22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22h.html},\n abstract = \t {We develop a new, principled algorithm for estimating the contribution of training data points to the behavior of a deep learning model, such as a specific prediction it makes. Our algorithm estimates the AME, a quantity that measures the expected (average) marginal effect of adding a data point to a subset of the training data, sampled from a given distribution. When subsets are sampled from the uniform distribution, the AME reduces to the well-known Shapley value. Our approach is inspired by causal inference and randomized experiments: we sample different subsets of the training data to train multiple submodels, and evaluate each submodel\u2019s behavior. We then use a LASSO regression to jointly estimate the AME of each data point, based on the subset compositions. 
Under sparsity assumptions ($k \\ll N$ datapoints have large AME), our estimator requires only $O(k\\log N)$ randomized submodel trainings, improving upon the best prior Shapley value estimators.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22h/lin22h.pdf", "supp": "", "pdf_size": 5553685, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7808395865683583052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, New York University, New York, NY; Department of Computer Science, New York University, New York, NY; University of British Columbia, Vancouver, Canada; Department of Computer Science, New York University, New York, NY; Department of Computer Science, New York University, New York, NY; Microsoft Research, New York, NY", "aff_domain": "nyu.edu; ;ubc.ca; ; ;microsoft.com", "email": "nyu.edu; ;ubc.ca; ; ;microsoft.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/lin22h.html", "aff_unique_index": "0;0;1;0;0;2", "aff_unique_norm": "New York University;University of British Columbia;Microsoft", "aff_unique_dep": "Department of Computer Science;;Microsoft Research", "aff_unique_url": "https://www.nyu.edu;https://www.ubc.ca;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "NYU;UBC;MSR", "aff_campus_unique_index": "0;0;1;0;0;0", "aff_campus_unique": "New York;Vancouver", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "title": "MemSR: Training Memory-efficient Lightweight Model for Image Super-Resolution", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16085", "id": "16085", "proceeding": "https://proceedings.mlr.press/v162/wu22f.html", "poster": "/media/PosterPDFs/ICML%202022/7fd804295ef7f6a2822bf4c61f9dc4a8.png?t=1656059712.684093", "slides": "/media/icml-2022/Slides/16085_CAvKgor.pdf", "author_site": "Kailu Wu, Chung-Kuei Lee, Kaisheng Ma", "author": "Kailu Wu; Chung-Kuei Lee; Kaisheng Ma", "abstract": "Methods based on deep neural networks with a massive number of layers and skip-connections have made impressive improvements on single image super-resolution (SISR). The skip-connections in these complex models boost the performance at the cost of a large amount of memory. With the increase of camera resolution from 1 million pixels to 100 million pixels on mobile phones, the memory footprint of these algorithms also increases hundreds of times, which restricts the applicability of these models on memory-limited devices. A plain model consisting of a stack of 3{\\texttimes}3 convolutions with ReLU, in contrast, has the highest memory efficiency but poorly performs on super-resolution. This paper aims at calculating a winning initialization from a complex teacher network for a plain student network, which can provide performance comparable to complex models. To this end, we convert the teacher model to an equivalent large plain model and derive the plain student\u2019s initialization. We further improve the student\u2019s performance through initialization-aware feature distillation. 
Extensive experiments suggest that the proposed method results in a model with a competitive trade-off between accuracy and speed at a much lower memory footprint than other state-of-the-art lightweight approaches.", "bibtex": "@InProceedings{pmlr-v162-wu22f,\n title = \t {{M}em{SR}: Training Memory-efficient Lightweight Model for Image Super-Resolution},\n author = {Wu, Kailu and Lee, Chung-Kuei and Ma, Kaisheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24076--24092},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22f/wu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22f.html},\n abstract = \t {Methods based on deep neural networks with a massive number of layers and skip-connections have made impressive improvements on single image super-resolution (SISR). The skip-connections in these complex models boost the performance at the cost of a large amount of memory. With the increase of camera resolution from 1 million pixels to 100 million pixels on mobile phones, the memory footprint of these algorithms also increases hundreds of times, which restricts the applicability of these models on memory-limited devices. A plain model consisting of a stack of 3{\\texttimes}3 convolutions with ReLU, in contrast, has the highest memory efficiency but poorly performs on super-resolution. This paper aims at calculating a winning initialization from a complex teacher network for a plain student network, which can provide performance comparable to complex models. To this end, we convert the teacher model to an equivalent large plain model and derive the plain student\u2019s initialization. We further improve the student\u2019s performance through initialization-aware feature distillation. 
Extensive experiments suggest that the proposed method results in a model with a competitive trade-off between accuracy and speed at a much lower memory footprint than other state-of-the-art lightweight approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22f/wu22f.pdf", "supp": "", "pdf_size": 1524364, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15252416883542999380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; HiSilicon Technologies, Shanghai, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China", "aff_domain": "tsinghua.edu.cn; ;tsinghua.edu.cn", "email": "tsinghua.edu.cn; ;tsinghua.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22f.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;HiSilicon Technologies", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hisilicon.com", "aff_unique_abbr": "Tsinghua;", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Beijing;Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Memory-Based Model Editing at Scale", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17533", "id": "17533", "proceeding": "https://proceedings.mlr.press/v162/mitchell22a.html", "poster": "/media/PosterPDFs/ICML%202022/c133fb1bb634af68c5088f3438848bfd.png?t=1658082070.610856", "slides": "", "author_site": "Eric Mitchell, Charles Lin, Antoine Bosselut, Christopher Manning, Chelsea Finn", "author": "Eric Mitchell; Charles Lin; Antoine Bosselut; Christopher D Manning; Chelsea Finn", "abstract": "Even the largest neural networks make errors, and once-correct predictions can become invalid as the world changes. Model editors make local updates to the behavior of base (pre-trained) models to inject updated knowledge or correct undesirable behaviors. Existing model editors have shown promise, but also suffer from insufficient expressiveness: they struggle to accurately model an edit\u2019s intended scope (examples affected by the edit), leading to inaccurate predictions for test inputs loosely related to the edit, and they often fail altogether after many edits. As a higher-capacity alternative, we propose Semi-Parametric Editing with a Retrieval-Augmented Counterfactual Model (SERAC), which stores edits in an explicit memory and learns to reason over them to modulate the base model\u2019s predictions as needed. To enable more rigorous evaluation of model editors, we introduce three challenging language model editing problems based on question answering, fact-checking, and dialogue generation. We find that only SERAC achieves high performance on all three problems, consistently outperforming existing approaches to model editing by a significant margin. 
Code, data, and additional project information will be made available at https://sites.google.com/view/serac-editing.", "bibtex": "@InProceedings{pmlr-v162-mitchell22a,\n title = \t {Memory-Based Model Editing at Scale},\n author = {Mitchell, Eric and Lin, Charles and Bosselut, Antoine and Manning, Christopher D and Finn, Chelsea},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15817--15831},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mitchell22a/mitchell22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mitchell22a.html},\n abstract = \t {Even the largest neural networks make errors, and once-correct predictions can become invalid as the world changes. Model editors make local updates to the behavior of base (pre-trained) models to inject updated knowledge or correct undesirable behaviors. Existing model editors have shown promise, but also suffer from insufficient expressiveness: they struggle to accurately model an edit\u2019s intended scope (examples affected by the edit), leading to inaccurate predictions for test inputs loosely related to the edit, and they often fail altogether after many edits. As a higher-capacity alternative, we propose Semi-Parametric Editing with a Retrieval-Augmented Counterfactual Model (SERAC), which stores edits in an explicit memory and learns to reason over them to modulate the base model\u2019s predictions as needed. To enable more rigorous evaluation of model editors, we introduce three challenging language model editing problems based on question answering, fact-checking, and dialogue generation. We find that only SERAC achieves high performance on all three problems, consistently outperforming existing approaches to model editing by a significant margin. 
Code, data, and additional project information will be made available at https://sites.google.com/view/serac-editing.}\n}", "pdf": "https://proceedings.mlr.press/v162/mitchell22a/mitchell22a.pdf", "supp": "", "pdf_size": 1449428, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16532462957428224688&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Stanford University Department of Computer Science; Stanford University Department of Computer Science; EPFL School of Computer and Communication Sciences; Stanford University Department of Computer Science; Stanford University Department of Computer Science", "aff_domain": "cs.stanford.edu; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ", "github": "", "project": "https://sites.google.com/view/serac-editing", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/mitchell22a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Stanford University;Ecole Polytechnique Federale de Lausanne", "aff_unique_dep": "Department of Computer Science;School of Computer and Communication Sciences", "aff_unique_url": "https://www.stanford.edu;https://www.epfl.ch", "aff_unique_abbr": "Stanford;EPFL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;Switzerland" }, { "title": "MetAug: Contrastive Learning via Meta Feature Augmentation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17317", "id": "17317", "proceeding": "https://proceedings.mlr.press/v162/li22r.html", "poster": "/media/PosterPDFs/ICML%202022/233f1dd0f3f537bcb7a338ea74d63483.png?t=1656435246.3388681", "slides": "/media/icml-2022/Slides/17317.pdf", "author_site": "Jiangmeng Li, Wenwen Qiang, Changwen Zheng, Bing Su, Hui Xiong", "author": "Jiangmeng Li; Wenwen Qiang; Changwen Zheng; Bing Su; Hui Xiong", "abstract": "What matters for contrastive learning? We argue that contrastive learning heavily relies on informative features, or \u201chard\u201d (positive or negative) features. Early works include more informative features by applying complex data augmentations and large batch size or memory bank, and recent works design elaborate sampling approaches to explore informative features. The key challenge toward exploring such features is that the source multi-view data is generated by applying random data augmentations, making it infeasible to always add useful information in the augmented data. Consequently, the informativeness of features learned from such augmented data is limited. In response, we propose to directly augment the features in latent space, thereby learning discriminative representations without a large amount of input data. We perform a meta learning technique to build the augmentation generator that updates its network parameters by considering the performance of the encoder. However, insufficient input data may lead the encoder to learn collapsed features and therefore malfunction the augmentation generator. A new margin-injected regularization is further added in the objective function to avoid the encoder learning a degenerate mapping. To contrast all features in one gradient back-propagation step, we adopt the proposed optimization-driven unified contrastive loss instead of the conventional contrastive loss. 
Empirically, our method achieves state-of-the-art results on several benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-li22r,\n title = \t {{M}et{A}ug: Contrastive Learning via Meta Feature Augmentation},\n author = {Li, Jiangmeng and Qiang, Wenwen and Zheng, Changwen and Su, Bing and Xiong, Hui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12964--12978},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22r/li22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22r.html},\n abstract = \t {What matters for contrastive learning? We argue that contrastive learning heavily relies on informative features, or \u201chard\u201d (positive or negative) features. Early works include more informative features by applying complex data augmentations and large batch size or memory bank, and recent works design elaborate sampling approaches to explore informative features. The key challenge toward exploring such features is that the source multi-view data is generated by applying random data augmentations, making it infeasible to always add useful information in the augmented data. Consequently, the informativeness of features learned from such augmented data is limited. In response, we propose to directly augment the features in latent space, thereby learning discriminative representations without a large amount of input data. We perform a meta learning technique to build the augmentation generator that updates its network parameters by considering the performance of the encoder. However, insufficient input data may lead the encoder to learn collapsed features and therefore malfunction the augmentation generator. A new margin-injected regularization is further added in the objective function to avoid the encoder learning a degenerate mapping. To contrast all features in one gradient back-propagation step, we adopt the proposed optimization-driven unified contrastive loss instead of the conventional contrastive loss. 
Empirically, our method achieves state-of-the-art results on several benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22r/li22r.pdf", "supp": "", "pdf_size": 1349594, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13342110327075124099&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+University of Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+University of Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Science & Technology on Integrated Information System Laboratory, Institute of Software Chinese Academy of Sciences, Beijing, China+Southern Marine Science and Engineering Guangdong Laboratory (Guangzhou), Guangdong, China; Gaoling School of Artificial Intelligence, Renmin University of China, Beijing, China+Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China; Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China+Department of Computer Science Engineering, The Hong Kong University of Science and Technology, Hong Kong SAR, China", "aff_domain": "iscas.ac.cn;iscas.ac.cn;smse.gdlab.ac.cn;gmail.com;ust.hk", "email": "iscas.ac.cn;iscas.ac.cn;smse.gdlab.ac.cn;gmail.com;ust.hk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/li22r.html", "aff_unique_index": "0+1+2;0+1+2;0+2;3+4;5+5", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Southern Marine Science and Engineering Guangdong Laboratory;Renmin University of China;Beijing Key Laboratory of Big Data Management and Analysis Methods;Hong Kong University of Science and Technology", "aff_unique_dep": "Institute of Software;;Marine Science and Engineering;Gaoling School of Artificial Intelligence;Big Data Management and Analysis;Thrust of Artificial Intelligence", "aff_unique_url": "http://www.ios.ac.cn;http://www.ucas.ac.cn;;http://www.ruc.edu.cn;;https://www.ust.hk", "aff_unique_abbr": "CAS;UCAS;;RUC;;HKUST", "aff_campus_unique_index": "0+0+1;0+0+1;0+1;0+0;1+2", "aff_campus_unique": "Beijing;Guangzhou;Hong Kong", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0+0;0+0", "aff_country_unique": "China" }, { "title": "Meta-Learning Hypothesis Spaces for Sequential Decision-making", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18151", "id": "18151", "proceeding": "https://proceedings.mlr.press/v162/kassraie22a.html", "poster": "/media/PosterPDFs/ICML%202022/3b5e2c9be5002e87e0477099db5ff21b.png?t=1657896209.85365", "slides": "", "author_site": "Parnian Kassraie, Jonas Rothfuss, Andreas Krause", "author": "Parnian Kassraie; Jonas Rothfuss; Andreas Krause", "abstract": "Obtaining reliable, adaptive confidence sets for prediction functions (hypotheses) is a central challenge in sequential decision-making tasks, such as bandits and model-based reinforcement learning. These confidence sets typically rely on prior assumptions on the hypothesis space, e.g., the known kernel of a Reproducing Kernel Hilbert Space (RKHS). 
Hand-designing such kernels is error prone, and misspecification may lead to poor or unsafe performance. In this work, we propose to meta-learn a kernel from offline data (Meta-KeL). For the case where the unknown kernel is a combination of known base kernels, we develop an estimator based on structured sparsity. Under mild conditions, we guarantee that our estimated RKHS yields valid confidence sets that, with increasing amounts of offline data, become as tight as those given the true unknown kernel. We demonstrate our approach on the kernelized bandits problem (a.k.a. Bayesian optimization), where we establish regret bounds competitive with those given the true kernel. We also empirically evaluate the effectiveness of our approach on a Bayesian optimization task.", "bibtex": "@InProceedings{pmlr-v162-kassraie22a,\n title = \t {Meta-Learning Hypothesis Spaces for Sequential Decision-making},\n author = {Kassraie, Parnian and Rothfuss, Jonas and Krause, Andreas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10802--10824},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kassraie22a/kassraie22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kassraie22a.html},\n abstract = \t {Obtaining reliable, adaptive confidence sets for prediction functions (hypotheses) is a central challenge in sequential decision-making tasks, such as bandits and model-based reinforcement learning. These confidence sets typically rely on prior assumptions on the hypothesis space, e.g., the known kernel of a Reproducing Kernel Hilbert Space (RKHS). Hand-designing such kernels is error prone, and misspecification may lead to poor or unsafe performance. In this work, we propose to meta-learn a kernel from offline data (Meta-KeL). For the case where the unknown kernel is a combination of known base kernels, we develop an estimator based on structured sparsity. Under mild conditions, we guarantee that our estimated RKHS yields valid confidence sets that, with increasing amounts of offline data, become as tight as those given the true unknown kernel. We demonstrate our approach on the kernelized bandits problem (a.k.a. Bayesian optimization), where we establish regret bounds competitive with those given the true kernel. 
We also empirically evaluate the effectiveness of our approach on a Bayesian optimization task.}\n}", "pdf": "https://proceedings.mlr.press/v162/kassraie22a/kassraie22a.pdf", "supp": "", "pdf_size": 705677, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8626659388081062344&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "ETH Zurich, Switzerland; ETH Zurich, Switzerland; ETH Zurich, Switzerland", "aff_domain": "ethz.ch; ; ", "email": "ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kassraie22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Metric-Fair Active Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16953", "id": "16953", "proceeding": "https://proceedings.mlr.press/v162/shen22b.html", "poster": "/media/PosterPDFs/ICML%202022/ab817c9349cf9c4f6877e1894a1faa00_E5qX7l5.png?t=1657575588.9997823", "slides": "", "author_site": "Jie Shen, Nan Cui, Jing Wang", "author": "Jie Shen; Nan Cui; Jing Wang", "abstract": "Active learning has become a prevalent technique for designing label-efficient algorithms, where the central principle is to only query and fit \u201cinformative\u201d labeled instances. It is, however, known that an active learning algorithm may incur unfairness due to such instance selection procedure. In this paper, we henceforth study metric-fair active learning of homogeneous halfspaces, and show that under the distribution-dependent PAC learning model, fairness and label efficiency can be achieved simultaneously. We further propose two extensions of our main results: 1) we show that it is possible to make the algorithm robust to the adversarial noise\u00a0\u2013\u00a0one of the most challenging noise models in learning theory; and 2) it is possible to significantly improve the label complexity when the underlying halfspace is sparse.", "bibtex": "@InProceedings{pmlr-v162-shen22b,\n title = \t {Metric-Fair Active Learning},\n author = {Shen, Jie and Cui, Nan and Wang, Jing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19809--19826},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22b/shen22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22b.html},\n abstract = \t {Active learning has become a prevalent technique for designing label-efficient algorithms, where the central principle is to only query and fit \u201cinformative\u201d labeled instances. It is, however, known that an active learning algorithm may incur unfairness due to such instance selection procedure. In this paper, we henceforth study metric-fair active learning of homogeneous halfspaces, and show that under the distribution-dependent PAC learning model, fairness and label efficiency can be achieved simultaneously. 
We further propose two extensions of our main results: 1) we show that it is possible to make the algorithm robust to the adversarial noise\u00a0\u2013\u00a0one of the most challenging noise models in learning theory; and 2) it is possible to significantly improve the label complexity when the underlying halfspace is sparse.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22b/shen22b.pdf", "supp": "", "pdf_size": 335491, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12653197446324810385&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA; Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA; Amazon, New York City, New York, USA", "aff_domain": "stevens.edu;stevens.edu;gmail.com", "email": "stevens.edu;stevens.edu;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/shen22b.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Stevens Institute of Technology;Amazon", "aff_unique_dep": "Department of Computer Science;Amazon", "aff_unique_url": "https://www.stevens.edu;https://www.amazon.com", "aff_unique_abbr": "SIT;Amazon", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Hoboken;New York City", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Metric-Fair Classifier Derandomization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17055", "id": "17055", "proceeding": "https://proceedings.mlr.press/v162/wu22a.html", "poster": "/media/PosterPDFs/ICML%202022/3fe78a8acf5fda99de95303940a2420c_45H6w60.png?t=1658178488.430031", "slides": "/media/icml-2022/Slides/17055_rWJGgs5.pdf", "author_site": "Jimmy Wu, Yatong Chen, Yang Liu", "author": "Jimmy Wu; Yatong Chen; Yang Liu", "abstract": "We study the problem of", "bibtex": "@InProceedings{pmlr-v162-wu22a,\n title = \t {Metric-Fair Classifier Derandomization},\n author = {Wu, Jimmy and Chen, Yatong and Liu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23999--24016},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22a/wu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22a.html},\n abstract = \t {We study the problem of", "pdf": "https://proceedings.mlr.press/v162/wu22a/wu22a.pdf", "supp": "", "pdf_size": 385502, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1917427936255275171&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Engineering, University of California, Santa Cruz, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, USA", "aff_domain": "gmail.com;ucsc.edu;ucsc.edu", "email": "gmail.com;ucsc.edu;ucsc.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Santa Cruz", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": 
"https://www.ucsc.edu", "aff_unique_abbr": "UCSC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Minimax Classification under Concept Drift with Multidimensional Adaptation and Performance Guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16093", "id": "16093", "proceeding": "https://proceedings.mlr.press/v162/alvarez22a.html", "poster": "/media/PosterPDFs/ICML%202022/009a5510ad149a8e0c750cb62e255175_g8coRAi.png?t=1655972896.837443", "slides": "/media/icml-2022/Slides/16093.pdf", "author_site": "Ver\u00f3nica \u00c1lvarez, Santiago Mazuelas, Jose A Lozano", "author": "Ver\u00f3nica \u00c1lvarez; Santiago Mazuelas; Jose A Lozano", "abstract": "The statistical characteristics of instance-label pairs often change with time in practical scenarios of supervised classification. Conventional learning techniques adapt to such concept drift accounting for a scalar rate of change by means of a carefully chosen learning rate, forgetting factor, or window size. However, the time changes in common scenarios are multidimensional, i.e., different statistical characteristics often change in a different manner. This paper presents adaptive minimax risk classifiers (AMRCs) that account for multidimensional time changes by means of a multivariate and high-order tracking of the time-varying underlying distribution. In addition, differently from conventional techniques, AMRCs can provide computable tight performance guarantees. Experiments on multiple benchmark datasets show the classification improvement of AMRCs compared to the state-of-the-art and the reliability of the presented performance guarantees.", "bibtex": "@InProceedings{pmlr-v162-alvarez22a,\n title = \t {Minimax Classification under Concept Drift with Multidimensional Adaptation and Performance Guarantees},\n author = {{\\'A}lvarez, Ver{\\'o}nica and Mazuelas, Santiago and Lozano, Jose A},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {486--499},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alvarez22a/alvarez22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alvarez22a.html},\n abstract = \t {The statistical characteristics of instance-label pairs often change with time in practical scenarios of supervised classification. Conventional learning techniques adapt to such concept drift accounting for a scalar rate of change by means of a carefully chosen learning rate, forgetting factor, or window size. However, the time changes in common scenarios are multidimensional, i.e., different statistical characteristics often change in a different manner. This paper presents adaptive minimax risk classifiers (AMRCs) that account for multidimensional time changes by means of a multivariate and high-order tracking of the time-varying underlying distribution. In addition, differently from conventional techniques, AMRCs can provide computable tight performance guarantees. 
Experiments on multiple benchmark datasets show the classification improvement of AMRCs compared to the state-of-the-art and the reliability of the presented performance guarantees.}\n}", "pdf": "https://proceedings.mlr.press/v162/alvarez22a/alvarez22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/alvarez22a-supp.zip", "pdf_size": 575101, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6492087255845076443&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "BCAM-Basque Center for Applied Mathematics, Bilbao, Spain+IKERBASQUE-Basque Foundation for Science; BCAM-Basque Center for Applied Mathematics, Bilbao, Spain+Intelligent Systems Group, University of the Basque Country UPV/EHU, San Sebasti\u00e1n, Spain; BCAM-Basque Center for Applied Mathematics, Bilbao, Spain+Intelligent Systems Group, University of the Basque Country UPV/EHU, San Sebasti\u00e1n, Spain", "aff_domain": "bcamath.org;bcamath.org;bcamath.org", "email": "bcamath.org;bcamath.org;bcamath.org", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/alvarez22a.html", "aff_unique_index": "0+1;0+2;0+2", "aff_unique_norm": "Basque Center for Applied Mathematics;IKERBASQUE-Basque Foundation for Science;University of the Basque Country", "aff_unique_dep": "Center for Applied Mathematics;;Intelligent Systems Group", "aff_unique_url": "https://www.bcamath.org/;https://www.ikerbasque.net/;https://www.ehu.eus/en", "aff_unique_abbr": "BCAM;IKERBASQUE;UPV/EHU", "aff_campus_unique_index": "0;0+2;0+2", "aff_campus_unique": "Bilbao;;San Sebasti\u00e1n", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Spain" }, { "title": "Minimax M-estimation under Adversarial Contamination", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16933", "id": "16933", "proceeding": "https://proceedings.mlr.press/v162/bhatt22a.html", "poster": "/media/PosterPDFs/ICML%202022/f02a8fde79ddf5b978cd9ae9d408b7c1.png?t=1657852227.457207", "slides": "", "author_site": "Sujay Bhatt, Guanhua Fang, Ping Li, Gennady Samorodnitsky", "author": "Sujay Bhatt; Guanhua Fang; Ping Li; Gennady Samorodnitsky", "abstract": "We present a new finite-sample analysis of Catoni\u2019s M-estimator under adversarial contamination, where an adversary is allowed to corrupt a fraction of the samples arbitrarily. We make minimal assumptions on the distribution of the uncontaminated random variables, namely, we only assume the existence of a known upper bound\u00a0$\\upsilon_{\\varepsilon} > 0$ on the\u00a0$(1+\\varepsilon)^{th}$ central moment of the random variables, namely, for\u00a0$\\varepsilon \\in (0,1]$ \\[ \\mathbb{E}_{X_1 \\sim \\mathcal{D}} \\Big| X_1 - \\mu \\Big|^{1+\\varepsilon} \\leq \\upsilon_{\\varepsilon}. \\]{We} provide a lower bound on the minimax error rate for the mean estimation problem under adversarial corruption under this weak assumption, and establish that the proposed M-estimator achieves this lower bound (up to multiplicative constants). When the variance is infinite, the tolerance to contamination of any estimator reduces as\u00a0$\\varepsilon \\downarrow 0$. We establish a tight upper bound that characterizes this bargain. 
To illustrate the usefulness of the derived robust M-estimator in an online setting, we present a bandit algorithm for the partially identifiable best arm identification problem that improves upon the sample complexity of the state of the art algorithms.", "bibtex": "@InProceedings{pmlr-v162-bhatt22a,\n title = \t {Minimax M-estimation under Adversarial Contamination},\n author = {Bhatt, Sujay and Fang, Guanhua and Li, Ping and Samorodnitsky, Gennady},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1906--1924},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bhatt22a/bhatt22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bhatt22a.html},\n abstract = \t {We present a new finite-sample analysis of Catoni\u2019s M-estimator under adversarial contamination, where an adversary is allowed to corrupt a fraction of the samples arbitrarily. We make minimal assumptions on the distribution of the uncontaminated random variables, namely, we only assume the existence of a known upper bound\u00a0$\\upsilon_{\\varepsilon} > 0$ on the\u00a0$(1+\\varepsilon)^{th}$ central moment of the random variables, namely, for\u00a0$\\varepsilon \\in (0,1]$ \\[ \\mathbb{E}_{X_1 \\sim \\mathcal{D}} \\Big| X_1 - \\mu \\Big|^{1+\\varepsilon} \\leq \\upsilon_{\\varepsilon}. \\]{We} provide a lower bound on the minimax error rate for the mean estimation problem under adversarial corruption under this weak assumption, and establish that the proposed M-estimator achieves this lower bound (up to multiplicative constants). When the variance is infinite, the tolerance to contamination of any estimator reduces as\u00a0$\\varepsilon \\downarrow 0$. We establish a tight upper bound that characterizes this bargain. 
To illustrate the usefulness of the derived robust M-estimator in an online setting, we present a bandit algorithm for the partially identifiable best arm identification problem that improves upon the sample complexity of the state of the art algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/bhatt22a/bhatt22a.pdf", "supp": "", "pdf_size": 638120, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4659016535701018066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Cognitive Computing Lab; Baidu Research; School of ORIE; Cornell University", "aff_domain": "gmail.com;gmail.com;gmail.com;cornell.edu", "email": "gmail.com;gmail.com;gmail.com;cornell.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bhatt22a.html", "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Cognitive Computing Lab;Baidu;Cornell University", "aff_unique_dep": "Cognitive Computing;Baidu Research;School of Operations Research and Information Engineering", "aff_unique_url": ";https://research.baidu.com;https://orie.cornell.edu", "aff_unique_abbr": ";Baidu;ORIE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;2", "aff_country_unique": ";China;United States" }, { "title": "Minimizing Control for Credit Assignment with Strong Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18059", "id": "18059", "proceeding": "https://proceedings.mlr.press/v162/meulemans22a.html", "poster": "/media/PosterPDFs/ICML%202022/ff0abbcc0227c9124a804b084d161a2d.png?t=1657267398.132317", "slides": "", "author_site": "Alexander Meulemans, Matilde Tristany Farinha, Maria Cervera, Jo\u00e3o Sacramento, Benjamin F. Grewe", "author": "Alexander Meulemans; Matilde Tristany Farinha; Maria R. Cervera; Jo\u00e3o Sacramento; Benjamin F. Grewe", "abstract": "The success of deep learning ignited interest in whether the brain learns hierarchical representations using gradient-based learning. However, current biologically plausible methods for gradient-based credit assignment in deep neural networks need infinitesimally small feedback signals, which is problematic in biologically realistic noisy environments and at odds with experimental evidence in neuroscience showing that top-down feedback can significantly influence neural activity. Building upon deep feedback control (DFC), a recently proposed credit assignment method, we combine strong feedback influences on neural activity with gradient-based learning and show that this naturally leads to a novel view on neural network optimization. Instead of gradually changing the network weights towards configurations with low output loss, weight updates gradually minimize the amount of feedback required from a controller that drives the network to the supervised output label. Moreover, we show that the use of strong feedback in DFC allows learning forward and feedback connections simultaneously, using learning rules fully local in space and time. We complement our theoretical results with experiments on standard computer-vision benchmarks, showing competitive performance to backpropagation as well as robustness to noise. 
Overall, our work presents a fundamentally novel view of learning as control minimization, while sidestepping biologically unrealistic assumptions.", "bibtex": "@InProceedings{pmlr-v162-meulemans22a,\n title = \t {Minimizing Control for Credit Assignment with Strong Feedback},\n author = {Meulemans, Alexander and Farinha, Matilde Tristany and Cervera, Maria R. and Sacramento, Jo{\\~a}o and Grewe, Benjamin F.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15458--15483},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/meulemans22a/meulemans22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/meulemans22a.html},\n abstract = \t {The success of deep learning ignited interest in whether the brain learns hierarchical representations using gradient-based learning. However, current biologically plausible methods for gradient-based credit assignment in deep neural networks need infinitesimally small feedback signals, which is problematic in biologically realistic noisy environments and at odds with experimental evidence in neuroscience showing that top-down feedback can significantly influence neural activity. Building upon deep feedback control (DFC), a recently proposed credit assignment method, we combine strong feedback influences on neural activity with gradient-based learning and show that this naturally leads to a novel view on neural network optimization. Instead of gradually changing the network weights towards configurations with low output loss, weight updates gradually minimize the amount of feedback required from a controller that drives the network to the supervised output label. Moreover, we show that the use of strong feedback in DFC allows learning forward and feedback connections simultaneously, using learning rules fully local in space and time. We complement our theoretical results with experiments on standard computer-vision benchmarks, showing competitive performance to backpropagation as well as robustness to noise. 
Overall, our work presents a fundamentally novel view of learning as control minimization, while sidestepping biologically unrealistic assumptions.}\n}", "pdf": "https://proceedings.mlr.press/v162/meulemans22a/meulemans22a.pdf", "supp": "", "pdf_size": 1112655, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4546119476247760219&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Institute of Neuroinformatics, University of Z\u00fcrich and ETH Z\u00fcrich, Switzerland; Institute of Neuroinformatics, University of Z\u00fcrich and ETH Z\u00fcrich, Switzerland; Institute of Neuroinformatics, University of Z\u00fcrich and ETH Z\u00fcrich, Switzerland; Institute of Neuroinformatics, University of Z\u00fcrich and ETH Z\u00fcrich, Switzerland; Institute of Neuroinformatics, University of Z\u00fcrich and ETH Z\u00fcrich, Switzerland", "aff_domain": "ethz.ch; ; ; ; ", "email": "ethz.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/meulemans22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Z\u00fcrich", "aff_unique_dep": "Institute of Neuroinformatics", "aff_unique_url": "https://www.neuro.ethz.ch/", "aff_unique_abbr": "UZH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Minimum Cost Intervention Design for Causal Effect Identification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17379", "id": "17379", "proceeding": "https://proceedings.mlr.press/v162/akbari22a.html", "poster": "/media/PosterPDFs/ICML%202022/c058f544c737782deacefa532d9add4c.png?t=1656603507.9614565", "slides": "", "author_site": "Sina Akbari, Jalal Etesami, Negar Kiyavash", "author": "Sina Akbari; Jalal Etesami; Negar Kiyavash", "abstract": "Pearl\u2019s do calculus is a complete axiomatic approach to learn the identifiable causal effects from observational data. When such an effect is not identifiable, it is necessary to perform a collection of often costly interventions in the system to learn the causal effect. In this work, we consider the problem of designing the collection of interventions with the minimum cost to identify the desired effect. First, we prove that this problem is NP-complete, and subsequently propose an algorithm that can either find the optimal solution or a logarithmic-factor approximation of it. This is done by establishing a connection between our problem and the minimum hitting set problem. Additionally, we propose several polynomial time heuristic algorithms to tackle the computational complexity of the problem. 
Although these algorithms could potentially stumble on sub-optimal solutions, our simulations show that they achieve small regrets on random graphs.", "bibtex": "@InProceedings{pmlr-v162-akbari22a,\n title = \t {Minimum Cost Intervention Design for Causal Effect Identification},\n author = {Akbari, Sina and Etesami, Jalal and Kiyavash, Negar},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {258--289},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/akbari22a/akbari22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/akbari22a.html},\n abstract = \t {Pearl\u2019s do calculus is a complete axiomatic approach to learn the identifiable causal effects from observational data. When such an effect is not identifiable, it is necessary to perform a collection of often costly interventions in the system to learn the causal effect. In this work, we consider the problem of designing the collection of interventions with the minimum cost to identify the desired effect. First, we prove that this problem is NP-complete, and subsequently propose an algorithm that can either find the optimal solution or a logarithmic-factor approximation of it. This is done by establishing a connection between our problem and the minimum hitting set problem. Additionally, we propose several polynomial time heuristic algorithms to tackle the computational complexity of the problem. Although these algorithms could potentially stumble on sub-optimal solutions, our simulations show that they achieve small regrets on random graphs.}\n}", "pdf": "https://proceedings.mlr.press/v162/akbari22a/akbari22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/akbari22a-supp.zip", "pdf_size": 789567, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8464705336757566822&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "College of Management of Technology, EPFL; College of Management of Technology, EPFL; College of Management of Technology, EPFL", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/akbari22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "College of Management of Technology", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Mirror Learning: A Unifying Framework of Policy Optimisation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17441", "id": "17441", "proceeding": "https://proceedings.mlr.press/v162/grudzien22a.html", "poster": "/media/PosterPDFs/ICML%202022/892c91e0a653ba19df81a90f89d99bcd_FAEVCpp.png?t=1657973586.6947412", "slides": "/media/icml-2022/Slides/17441.pdf", "author_site": "Jakub Grudzien Kuba, Christian Schroeder de Witt, Jakob Foerster", "author": "Jakub Grudzien; Christian A Schroeder De Witt; Jakob Foerster", "abstract": "Modern deep reinforcement learning (RL) algorithms are motivated by either the general policy improvement (GPI) or trust-region learning (TRL) frameworks. 
However, algorithms that strictly respect these theoretical frameworks have proven unscalable. Surprisingly, the only known scalable algorithms violate the GPI/TRL assumptions, e.g. due to required regularisation or other heuristics. The current explanation of their empirical success is essentially \u201cby analogy\u201d: they are deemed approximate adaptations of theoretically sound methods. Unfortunately, studies have shown that in practice these algorithms differ greatly from their conceptual ancestors. In contrast, in this paper, we introduce a novel theoretical framework, named Mirror Learning, which provides theoretical guarantees to a large class of algorithms, including TRPO and PPO. While the latter two exploit the flexibility of our framework, GPI and TRL fit in merely as pathologically restrictive corner cases thereof. This suggests that the empirical performance of state-of-the-art methods is a direct consequence of their theoretical properties, rather than of aforementioned approximate analogies. Mirror learning sets us free to boldly explore novel, theoretically sound RL algorithms, a thus far uncharted wonderland.", "bibtex": "@InProceedings{pmlr-v162-grudzien22a,\n title = \t {Mirror Learning: A Unifying Framework of Policy Optimisation},\n author = {Grudzien, Jakub and De Witt, Christian A Schroeder and Foerster, Jakob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7825--7844},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/grudzien22a/grudzien22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/grudzien22a.html},\n abstract = \t {Modern deep reinforcement learning (RL) algorithms are motivated by either the general policy improvement (GPI) or trust-region learning (TRL) frameworks. However, algorithms that strictly respect these theoretical frameworks have proven unscalable. Surprisingly, the only known scalable algorithms violate the GPI/TRL assumptions, e.g. due to required regularisation or other heuristics. The current explanation of their empirical success is essentially \u201cby analogy\u201d: they are deemed approximate adaptations of theoretically sound methods. Unfortunately, studies have shown that in practice these algorithms differ greatly from their conceptual ancestors. In contrast, in this paper, we introduce a novel theoretical framework, named Mirror Learning, which provides theoretical guarantees to a large class of algorithms, including TRPO and PPO. While the latter two exploit the flexibility of our framework, GPI and TRL fit in merely as pathologically restrictive corner cases thereof. This suggests that the empirical performance of state-of-the-art methods is a direct consequence of their theoretical properties, rather than of aforementioned approximate analogies. 
Mirror learning sets us free to boldly explore novel, theoretically sound RL algorithms, a thus far uncharted wonderland.}\n}", "pdf": "https://proceedings.mlr.press/v162/grudzien22a/grudzien22a.pdf", "supp": "", "pdf_size": 3061455, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3875011415872295782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Oxford; University of Oxford; University of Oxford", "aff_domain": "new.ox.ac.uk; ; ", "email": "new.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/grudzien22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Mitigating Gender Bias in Face Recognition using the von Mises-Fisher Mixture Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17419", "id": "17419", "proceeding": "https://proceedings.mlr.press/v162/conti22a.html", "poster": "/media/PosterPDFs/ICML%202022/d6dabcc412981d56c8733b52586a9d44.png?t=1657561481.0927567", "slides": "", "author_site": "Jean-R\u00e9my Conti, Nathan NOIRY, Stephan Clemencon, Vincent Despiegel, St\u00e9phane Gentric", "author": "Jean-R\u00e9my Conti; Nathan Noiry; Stephan Clemencon; Vincent Despiegel; St\u00e9phane Gentric", "abstract": "In spite of the high performance and reliability of deep learning algorithms in a wide range of everyday applications, many investigations tend to show that a lot of models exhibit biases, discriminating against specific subgroups of the population (e.g. gender, ethnicity). This urges the practitioner to develop fair systems with a uniform/comparable performance across sensitive groups. In this work, we investigate the gender bias of deep Face Recognition networks. In order to measure this bias, we introduce two new metrics, BFAR and BFRR, that better reflect the inherent deployment needs of Face Recognition systems. Motivated by geometric considerations, we mitigate gender bias through a new post-processing methodology which transforms the deep embeddings of a pre-trained model to give more representation power to discriminated subgroups. It consists in training a shallow neural network by minimizing a Fair von Mises-Fisher loss whose hyperparameters account for the intra-class variance of each gender. Interestingly, we empirically observe that these hyperparameters are correlated with our fairness metrics. 
In fact, extensive numerical experiments on a variety of datasets show that a careful selection significantly reduces gender bias.", "bibtex": "@InProceedings{pmlr-v162-conti22a,\n title = \t {Mitigating Gender Bias in Face Recognition using the von Mises-{F}isher Mixture Model},\n author = {Conti, Jean-R{\\'e}my and Noiry, Nathan and Clemencon, Stephan and Despiegel, Vincent and Gentric, St{\\'e}phane},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4344--4369},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/conti22a/conti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/conti22a.html},\n abstract = \t {In spite of the high performance and reliability of deep learning algorithms in a wide range of everyday applications, many investigations tend to show that a lot of models exhibit biases, discriminating against specific subgroups of the population (e.g. gender, ethnicity). This urges the practitioner to develop fair systems with a uniform/comparable performance across sensitive groups. In this work, we investigate the gender bias of deep Face Recognition networks. In order to measure this bias, we introduce two new metrics, BFAR and BFRR, that better reflect the inherent deployment needs of Face Recognition systems. Motivated by geometric considerations, we mitigate gender bias through a new post-processing methodology which transforms the deep embeddings of a pre-trained model to give more representation power to discriminated subgroups. It consists in training a shallow neural network by minimizing a Fair von Mises-Fisher loss whose hyperparameters account for the intra-class variance of each gender. Interestingly, we empirically observe that these hyperparameters are correlated with our fairness metrics. 
In fact, extensive numerical experiments on a variety of datasets show that a careful selection significantly reduces gender bias.}\n}", "pdf": "https://proceedings.mlr.press/v162/conti22a/conti22a.pdf", "supp": "", "pdf_size": 1668170, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11800206203871099663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "LTCI, Telecom Paris, Institut Polytechnique de Paris+Idemia; LTCI, Telecom Paris, Institut Polytechnique de Paris+Idemia; Idemia; Idemia; LTCI, Telecom Paris, Institut Polytechnique de Paris", "aff_domain": "telecom-paris.fr;gmail.com; ; ; ", "email": "telecom-paris.fr;gmail.com; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/conti22a.html", "aff_unique_index": "0+1;0+1;1;1;0", "aff_unique_norm": "Telecom Paris;IDEMIA", "aff_unique_dep": "LTCI;", "aff_unique_url": "https://www.telecom-paris.fr;https://www.idemia.com", "aff_unique_abbr": "Telecom Paris;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0;0", "aff_country_unique": "France" }, { "title": "Mitigating Modality Collapse in Multimodal VAEs via Impartial Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17491", "id": "17491", "proceeding": "https://proceedings.mlr.press/v162/javaloy22a.html", "poster": "/media/PosterPDFs/ICML%202022/b7da6669894867f04b8727876a69ffc0.png?t=1657660995.7706163", "slides": "", "author_site": "Adri\u00e1n Javaloy, Maryam Meghdadi, Isabel Valera", "author": "Adrian Javaloy; Maryam Meghdadi; Isabel Valera", "abstract": "A number of variational autoencoders (VAEs) have recently emerged with the aim of modeling multimodal data, e.g., to jointly model images and their corresponding captions. Still, multimodal VAEs tend to focus solely on a subset of the modalities, e.g., by fitting the image while neglecting the caption. We refer to this limitation as modality collapse. In this work, we argue that this effect is a consequence of conflicting gradients during multimodal VAE training. We show how to detect the sub-graphs in the computational graphs where gradients conflict (impartiality blocks), as well as how to leverage existing gradient-conflict solutions from multitask learning to mitigate modality collapse. That is, to ensure impartial optimization across modalities. 
We apply our training framework to several multimodal VAE models, losses and datasets from the literature, and empirically show that our framework significantly improves the reconstruction performance, conditional generation, and coherence of the latent space across modalities.", "bibtex": "@InProceedings{pmlr-v162-javaloy22a,\n title = \t {Mitigating Modality Collapse in Multimodal {VAE}s via Impartial Optimization},\n author = {Javaloy, Adrian and Meghdadi, Maryam and Valera, Isabel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9938--9964},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/javaloy22a/javaloy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/javaloy22a.html},\n abstract = \t {A number of variational autoencoders (VAEs) have recently emerged with the aim of modeling multimodal data, e.g., to jointly model images and their corresponding captions. Still, multimodal VAEs tend to focus solely on a subset of the modalities, e.g., by fitting the image while neglecting the caption. We refer to this limitation as modality collapse. In this work, we argue that this effect is a consequence of conflicting gradients during multimodal VAE training. We show how to detect the sub-graphs in the computational graphs where gradients conflict (impartiality blocks), as well as how to leverage existing gradient-conflict solutions from multitask learning to mitigate modality collapse. That is, to ensure impartial optimization across modalities. 
We apply our training framework to several multimodal VAE models, losses and datasets from the literature, and empirically show that our framework significantly improves the reconstruction performance, conditional generation, and coherence of the latent space across modalities.}\n}", "pdf": "https://proceedings.mlr.press/v162/javaloy22a/javaloy22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/javaloy22a-supp.zip", "pdf_size": 6071065, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14600839373536938661&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Saarland University, Germany+MPI for Software Systems, Saarland, Germany; Department of Computer Science, Saarland University, Germany+MPI for Software Systems, Saarland, Germany; Department of Computer Science, Saarland University, Germany+MPI for Software Systems, Saarland, Germany", "aff_domain": "cs.uni-saarland.de; ; ", "email": "cs.uni-saarland.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/javaloy22a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Saarland University;Max Planck Institute for Software Systems", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.uni-saarland.de;https://www.mpi-sws.org", "aff_unique_abbr": ";MPI-SWS", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Germany" }, { "title": "Mitigating Neural Network Overconfidence with Logit Normalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16539", "id": "16539", "proceeding": "https://proceedings.mlr.press/v162/wei22d.html", "poster": "/media/PosterPDFs/ICML%202022/eaae5e04a259d09af85c108fe4d7dd0c.png?t=1657714638.4341052", "slides": "", "author_site": "Hongxin Wei, RENCHUNZI XIE, Hao Cheng, LEI FENG, Bo An, Yixuan Li", "author": "Hongxin Wei; Renchunzi Xie; Hao Cheng; Lei Feng; Bo An; Yixuan Li", "abstract": "Detecting out-of-distribution inputs is critical for the safe deployment of machine learning models in the real world. However, neural networks are known to suffer from the overconfidence issue, where they produce abnormally high confidence for both in- and out-of-distribution inputs. In this work, we show that this issue can be mitigated through Logit Normalization (LogitNorm)\u2014a simple fix to the cross-entropy loss\u2014by enforcing a constant vector norm on the logits in training. Our method is motivated by the analysis that the norm of the logit keeps increasing during training, leading to overconfident output. Our key idea behind LogitNorm is thus to decouple the influence of output\u2019s norm during network optimization. Trained with LogitNorm, neural networks produce highly distinguishable confidence scores between in- and out-of-distribution data. 
Extensive experiments demonstrate the superiority of LogitNorm, reducing the average FPR95 by up to 42.30% on common benchmarks.", "bibtex": "@InProceedings{pmlr-v162-wei22d,\n title = \t {Mitigating Neural Network Overconfidence with Logit Normalization},\n author = {Wei, Hongxin and Xie, Renchunzi and Cheng, Hao and Feng, Lei and An, Bo and Li, Yixuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23631--23644},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wei22d/wei22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/wei22d.html},\n abstract = \t {Detecting out-of-distribution inputs is critical for the safe deployment of machine learning models in the real world. However, neural networks are known to suffer from the overconfidence issue, where they produce abnormally high confidence for both in- and out-of-distribution inputs. In this work, we show that this issue can be mitigated through Logit Normalization (LogitNorm)\u2014a simple fix to the cross-entropy loss\u2014by enforcing a constant vector norm on the logits in training. Our method is motivated by the analysis that the norm of the logit keeps increasing during training, leading to overconfident output. Our key idea behind LogitNorm is thus to decouple the influence of output\u2019s norm during network optimization. Trained with LogitNorm, neural networks produce highly distinguishable confidence scores between in- and out-of-distribution data. Extensive experiments demonstrate the superiority of LogitNorm, reducing the average FPR95 by up to 42.30% on common benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/wei22d/wei22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wei22d-supp.zip", "pdf_size": 907548, "gs_citation": 362, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3765768230173383060&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Nanyang Technological University, Singapore; Nanyang Technological University, Singapore; Nanyang Technological University, Singapore + Nanjing University, Nanjing, Jiangsu, China; Chongqing University, Chongqing, China; Nanyang Technological University, Singapore; University of Wisconsin-Madison, Wisconsin, United States", "aff_domain": "ntu.edu.sg;e.ntu.edu.sg;ntu.edu.sg;cqu.edu.cn;ntu.edu.sg;wisc.edu", "email": "ntu.edu.sg;e.ntu.edu.sg;ntu.edu.sg;cqu.edu.cn;ntu.edu.sg;wisc.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wei22d.html", "aff_unique_index": "0;0;0+1;2;0;3", "aff_unique_norm": "Nanyang Technological University;Nanjing University;Chongqing University;University of Wisconsin-Madison", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.nju.edu.cn;http://www.cqu.edu.cn/;https://www.wisc.edu", "aff_unique_abbr": "NTU;Nanjing U;CQU;UW-Madison", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Nanjing;Chongqing;Madison", "aff_country_unique_index": "0;0;0+1;1;0;2", "aff_country_unique": "Singapore;China;United States" }, { "title": "ModLaNets: Learning Generalisable Dynamics via Modularity and Physical Inductive Bias", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17615", 
"id": "17615", "proceeding": "https://proceedings.mlr.press/v162/lu22c.html", "poster": "", "slides": "", "author_site": "Yupu Lu, Shijie Lin, Guanqi Chen, Jia Pan", "author": "Yupu Lu; Shijie Lin; Guanqi Chen; Jia Pan", "abstract": "Deep learning models are able to approximate one specific dynamical system but struggle at learning generalisable dynamics, where dynamical systems obey the same laws of physics but contain different numbers of elements (e.g., double- and triple-pendulum systems). To relieve this issue, we proposed the Modular Lagrangian Network (ModLaNet), a structural neural network framework with modularity and physical inductive bias. This framework models the energy of each element using modularity and then construct the target dynamical system via Lagrangian mechanics. Modularity is beneficial for reusing trained networks and reducing the scale of networks and datasets. As a result, our framework can learn from the dynamics of simpler systems and extend to more complex ones, which is not feasible using other relevant physics-informed neural networks. We examine our framework for modelling double-pendulum or three-body systems with small training datasets, where our models achieve the best data efficiency and accuracy performance compared with counterparts. We also reorganise our models as extensions to model multi-pendulum and multi-body systems, demonstrating the intriguing reusable feature of our framework.", "bibtex": "@InProceedings{pmlr-v162-lu22c,\n title = \t {{M}od{L}a{N}ets: Learning Generalisable Dynamics via Modularity and Physical Inductive Bias},\n author = {Lu, Yupu and Lin, Shijie and Chen, Guanqi and Pan, Jia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14384--14397},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22c/lu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22c.html},\n abstract = \t {Deep learning models are able to approximate one specific dynamical system but struggle at learning generalisable dynamics, where dynamical systems obey the same laws of physics but contain different numbers of elements (e.g., double- and triple-pendulum systems). To relieve this issue, we proposed the Modular Lagrangian Network (ModLaNet), a structural neural network framework with modularity and physical inductive bias. This framework models the energy of each element using modularity and then construct the target dynamical system via Lagrangian mechanics. Modularity is beneficial for reusing trained networks and reducing the scale of networks and datasets. As a result, our framework can learn from the dynamics of simpler systems and extend to more complex ones, which is not feasible using other relevant physics-informed neural networks. We examine our framework for modelling double-pendulum or three-body systems with small training datasets, where our models achieve the best data efficiency and accuracy performance compared with counterparts. 
We also reorganise our models as extensions to model multi-pendulum and multi-body systems, demonstrating the intriguing reusable feature of our framework.}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22c/lu22c.pdf", "supp": "", "pdf_size": 5655590, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13273673478017721155&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, The University of Hong Kong, Hong Kong SAR, China+Centre for Garment Production Limited (TransGP), Hong Kong SAR, China; Department of Computer Science, The University of Hong Kong, Hong Kong SAR, China+Centre for Garment Production Limited (TransGP), Hong Kong SAR, China; Department of Computer Science, The University of Hong Kong, Hong Kong SAR, China; Department of Computer Science, The University of Hong Kong, Hong Kong SAR, China+Centre for Garment Production Limited (TransGP), Hong Kong SAR, China", "aff_domain": "cs.hku.hk; ; ;cs.hku.hk", "email": "cs.hku.hk; ; ;cs.hku.hk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lu22c.html", "aff_unique_index": "0+1;0+1;0;0+1", "aff_unique_norm": "University of Hong Kong;Centre for Garment Production Limited", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.hku.hk;", "aff_unique_abbr": "HKU;TransGP", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0+0;0;0+0", "aff_country_unique": "China" }, { "title": "Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? (Provably)", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17617", "id": "17617", "proceeding": "https://proceedings.mlr.press/v162/huang22e.html", "poster": "/media/PosterPDFs/ICML%202022/038d5463327addf90d282c35be4c5eb1.png?t=1657203001.0239382", "slides": "", "author_site": "Yu Huang, Junyang Lin, Chang Zhou, Hongxia Yang, Longbo Huang", "author": "Yu Huang; Junyang Lin; Chang Zhou; Hongxia Yang; Longbo Huang", "abstract": "Despite the remarkable success of deep multi-modal learning in practice, it has not been well-explained in theory. Recently, it has been observed that the best uni-modal network outperforms the jointly trained multi-modal network across different combinations of modalities on various tasks, which is counter-intuitive since multiple signals would bring more information (Wang et al., 2020). This work provides a theoretical explanation for the emergence of such performance gap in neural networks for the prevalent joint training framework. Based on a simplified data distribution that captures the realistic property of multi-modal data, we prove that for multi-modal late-fusion network with (smoothed) ReLU activation trained jointly by gradient descent, different modalities will compete with each other and only a subset of modalities will be learned by its corresponding encoder networks. We refer to this phenomenon as modality competition, and the losing modalities, which fail to be discovered, are the origins where the sub-optimality of joint training comes from. In contrast, for uni-modal networks with similar learning settings, we provably show that the networks will focus on learning modality-associated features. Experimentally, we illustrate that modality competition matches the intrinsic behavior of late-fusion joint training to supplement our theoretical results. 
To the best of our knowledge, our work is the first theoretical treatment towards the degenerating aspect of multi-modal learning in neural networks.", "bibtex": "@InProceedings{pmlr-v162-huang22e,\n title = \t {Modality Competition: What Makes Joint Training of Multi-modal Network Fail in Deep Learning? ({P}rovably)},\n author = {Huang, Yu and Lin, Junyang and Zhou, Chang and Yang, Hongxia and Huang, Longbo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9226--9259},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22e/huang22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22e.html},\n abstract = \t {Despite the remarkable success of deep multi-modal learning in practice, it has not been well-explained in theory. Recently, it has been observed that the best uni-modal network outperforms the jointly trained multi-modal network across different combinations of modalities on various tasks, which is counter-intuitive since multiple signals would bring more information (Wang et al., 2020). This work provides a theoretical explanation for the emergence of such performance gap in neural networks for the prevalent joint training framework. Based on a simplified data distribution that captures the realistic property of multi-modal data, we prove that for multi-modal late-fusion network with (smoothed) ReLU activation trained jointly by gradient descent, different modalities will compete with each other and only a subset of modalities will be learned by its corresponding encoder networks. We refer to this phenomenon as modality competition, and the losing modalities, which fail to be discovered, are the origins where the sub-optimality of joint training comes from. In contrast, for uni-modal networks with similar learning settings, we provably show that the networks will focus on learning modality-associated features. Experimentally, we illustrate that modality competition matches the intrinsic behavior of late-fusion joint training to supplement our theoretical results. 
To the best of our knowledge, our work is the first theoretical treatment towards the degenerating aspect of multi-modal learning in neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22e/huang22e.pdf", "supp": "", "pdf_size": 2481595, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15736628291086189730&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, China", "aff_domain": "tsinghua.edu.cn; ; ; ;tsinghua.edu.cn", "email": "tsinghua.edu.cn; ; ; ;tsinghua.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/huang22e.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Tsinghua University;Alibaba Group", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;DAMO Academy", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "Tsinghua;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Model Agnostic Sample Reweighting for Out-of-Distribution Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17867", "id": "17867", "proceeding": "https://proceedings.mlr.press/v162/zhou22d.html", "poster": "/media/PosterPDFs/ICML%202022/55743cc0393b1cb4b8b37d09ae48d097.png?t=1657706061.8561294", "slides": "", "author_site": "Xiao Zhou, Yong LIN, Renjie Pi, Weizhong Zhang, Renzhe Xu, Peng Cui, Tong Zhang", "author": "Xiao Zhou; Yong Lin; Renjie Pi; Weizhong Zhang; Renzhe Xu; Peng Cui; Tong Zhang", "abstract": "Distributionally robust optimization (DRO) and invariant risk minimization (IRM) are two popular methods proposed to improve out-of-distribution (OOD) generalization performance of machine learning models. While effective for small models, it has been observed that these methods can be vulnerable to overfitting with large overparameterized models. This work proposes a principled method, Model Agnostic samPLe rEweighting (MAPLE), to effectively address OOD problem, especially in overparameterized scenarios. Our key idea is to find an effective reweighting of the training samples so that the standard empirical risk minimization training of a large model on the weighted training data leads to superior OOD generalization performance. The overfitting issue is addressed by considering a bilevel formulation to search for the sample reweighting, in which the generalization complexity depends on the search space of sample weights instead of the model size. 
We present theoretical analysis in linear case to prove the insensitivity of MAPLE to model size, and empirically verify its superiority in surpassing state-of-the-art methods by a large margin.", "bibtex": "@InProceedings{pmlr-v162-zhou22d,\n title = \t {Model Agnostic Sample Reweighting for Out-of-Distribution Learning},\n author = {Zhou, Xiao and Lin, Yong and Pi, Renjie and Zhang, Weizhong and Xu, Renzhe and Cui, Peng and Zhang, Tong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27203--27221},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22d/zhou22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22d.html},\n abstract = \t {Distributionally robust optimization (DRO) and invariant risk minimization (IRM) are two popular methods proposed to improve out-of-distribution (OOD) generalization performance of machine learning models. While effective for small models, it has been observed that these methods can be vulnerable to overfitting with large overparameterized models. This work proposes a principled method, Model Agnostic samPLe rEweighting (MAPLE), to effectively address OOD problem, especially in overparameterized scenarios. Our key idea is to find an effective reweighting of the training samples so that the standard empirical risk minimization training of a large model on the weighted training data leads to superior OOD generalization performance. The overfitting issue is addressed by considering a bilevel formulation to search for the sample reweighting, in which the generalization complexity depends on the search space of sample weights instead of the model size. 
We present theoretical analysis in linear case to prove the insensitivity of MAPLE to model size, and empirically verify its superiority in surpassing state-of-the-art methods by a large margin.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22d/zhou22d.pdf", "supp": "", "pdf_size": 4843422, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4328634809674273852&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "The Hong Kong University of Science and Technology; The Hong Kong University of Science and Technology; The Hong Kong University of Science and Technology; The Hong Kong University of Science and Technology; Tsinghua University; Tsinghua University; Google Research", "aff_domain": "ust.hk;ust.hk;ust.hk;ust.hk;tsinghua.edu.cn;tsinghua.edu.cn;tongzhang-ml.org", "email": "ust.hk;ust.hk;ust.hk;ust.hk;tsinghua.edu.cn;tsinghua.edu.cn;tongzhang-ml.org", "github": "https://github.com/x-zho14/MAPLE", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhou22d.html", "aff_unique_index": "0;0;0;0;1;1;2", "aff_unique_norm": "Hong Kong University of Science and Technology;Tsinghua University;Google", "aff_unique_dep": ";;Google Research", "aff_unique_url": "https://www.ust.hk;https://www.tsinghua.edu.cn;https://research.google", "aff_unique_abbr": "HKUST;THU;Google Research", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Hong Kong SAR;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Model Selection in Batch Policy Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17069", "id": "17069", "proceeding": "https://proceedings.mlr.press/v162/lee22k.html", "poster": "/media/PosterPDFs/ICML%202022/20546457187cf3d52ea86538403e47cc.png?t=1657488993.324945", "slides": "", "author_site": "Jonathan Lee, George Tucker, Ofir Nachum, Bo Dai", "author": "Jonathan Lee; George Tucker; Ofir Nachum; Bo Dai", "abstract": "We study the problem of model selection in batch policy optimization: given a fixed, partial-feedback dataset and M model classes, learn a policy with performance that is competitive with the policy derived from the best model class. We formalize the problem in the contextual bandit setting with linear model classes by identifying three sources of error that any model selection algorithm should optimally trade-off in order to be competitive: (1) approximation error, (2) statistical complexity, and (3) coverage. The first two sources are common in model selection for supervised learning, where optimally trading off these two is well-studied. In contrast, the third source is unique to batch policy optimization and is due to dataset shift inherent to the setting. We first show that no batch policy optimization algorithm can achieve a guarantee addressing all three simultaneously, revealing a stark contrast between difficulties in batch policy optimization and the positive results available in supervised learning. Despite this negative result, we show that relaxing any one of the three error sources enables the design of algorithms achieving near-oracle inequalities for the remaining two. 
We conclude with experiments demonstrating the efficacy of these algorithms.", "bibtex": "@InProceedings{pmlr-v162-lee22k,\n title = \t {Model Selection in Batch Policy Optimization},\n author = {Lee, Jonathan and Tucker, George and Nachum, Ofir and Dai, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12542--12569},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22k/lee22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22k.html},\n abstract = \t {We study the problem of model selection in batch policy optimization: given a fixed, partial-feedback dataset and M model classes, learn a policy with performance that is competitive with the policy derived from the best model class. We formalize the problem in the contextual bandit setting with linear model classes by identifying three sources of error that any model selection algorithm should optimally trade-off in order to be competitive: (1) approximation error, (2) statistical complexity, and (3) coverage. The first two sources are common in model selection for supervised learning, where optimally trading off these two is well-studied. In contrast, the third source is unique to batch policy optimization and is due to dataset shift inherent to the setting. We first show that no batch policy optimization algorithm can achieve a guarantee addressing all three simultaneously, revealing a stark contrast between difficulties in batch policy optimization and the positive results available in supervised learning. Despite this negative result, we show that relaxing any one of the three error sources enables the design of algorithms achieving near-oracle inequalities for the remaining two. 
We conclude with experiments demonstrating the efficacy of these algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22k/lee22k.pdf", "supp": "", "pdf_size": 467934, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3257223513386489741&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Stanford University, USA+Google Research, Mountain View, USA; Google Research, Mountain View, USA; Google Research, Mountain View, USA; Google Research, Mountain View, USA", "aff_domain": "stanford.edu; ; ; ", "email": "stanford.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lee22k.html", "aff_unique_index": "0+1;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Department of Computer Science;Google Research", "aff_unique_url": "https://www.stanford.edu;https://research.google", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0+1;1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16989", "id": "16989", "proceeding": "https://proceedings.mlr.press/v162/wortsman22a.html", "poster": "", "slides": "", "author_site": "Mitchell Wortsman, Gabriel Ilharco, Samir Gadre, Rebecca Roelofs, Raphael Gontijo Lopes, Ari Morcos, Hongseok Namkoong, Ali Farhadi, Yair Carmon, Simon Kornblith, Ludwig Schmidt", "author": "Mitchell Wortsman; Gabriel Ilharco; Samir Ya Gadre; Rebecca Roelofs; Raphael Gontijo-Lopes; Ari S Morcos; Hongseok Namkoong; Ali Farhadi; Yair Carmon; Simon Kornblith; Ludwig Schmidt", "abstract": "The conventional recipe for maximizing model accuracy is to (1) train multiple models with various hyperparameters and (2) pick the individual model which performs best on a held-out validation set, discarding the remainder. In this paper, we revisit the second step of this procedure in the context of fine-tuning large pre-trained models, where fine-tuned models often appear to lie in a single low error basin. We show that averaging the weights of multiple models fine-tuned with different hyperparameter configurations often improves accuracy and robustness. Unlike a conventional ensemble, we may average many models without incurring any additional inference or memory costs\u2014we call the results \u201cmodel soups.\u201d When fine-tuning large pre-trained models such as CLIP, ALIGN, and a ViT-G pre-trained on JFT, our soup recipe provides significant improvements over the best model in a hyperparameter sweep on ImageNet. The resulting ViT-G model, which attains 90.94% top-1 accuracy on ImageNet, achieved a new state of the art. Furthermore, we show that the model soup approach extends to multiple image classification and natural language processing tasks, improves out-of-distribution performance, and improves zero-shot performance on new downstream tasks. Finally, we analytically relate the performance similarity of weight-averaging and logit-ensembling to flatness of the loss and confidence of the predictions, and validate this relation empirically. 
Code is available at https://github.com/mlfoundations/model-soups.", "bibtex": "@InProceedings{pmlr-v162-wortsman22a,\n title = \t {Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time},\n author = {Wortsman, Mitchell and Ilharco, Gabriel and Gadre, Samir Ya and Roelofs, Rebecca and Gontijo-Lopes, Raphael and Morcos, Ari S and Namkoong, Hongseok and Farhadi, Ali and Carmon, Yair and Kornblith, Simon and Schmidt, Ludwig},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23965--23998},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wortsman22a/wortsman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wortsman22a.html},\n abstract = \t {The conventional recipe for maximizing model accuracy is to (1) train multiple models with various hyperparameters and (2) pick the individual model which performs best on a held-out validation set, discarding the remainder. In this paper, we revisit the second step of this procedure in the context of fine-tuning large pre-trained models, where fine-tuned models often appear to lie in a single low error basin. We show that averaging the weights of multiple models fine-tuned with different hyperparameter configurations often improves accuracy and robustness. Unlike a conventional ensemble, we may average many models without incurring any additional inference or memory costs\u2014we call the results \u201cmodel soups.\u201d When fine-tuning large pre-trained models such as CLIP, ALIGN, and a ViT-G pre-trained on JFT, our soup recipe provides significant improvements over the best model in a hyperparameter sweep on ImageNet. The resulting ViT-G model, which attains 90.94% top-1 accuracy on ImageNet, achieved a new state of the art. Furthermore, we show that the model soup approach extends to multiple image classification and natural language processing tasks, improves out-of-distribution performance, and improves zero-shot performance on new downstream tasks. Finally, we analytically relate the performance similarity of weight-averaging and logit-ensembling to flatness of the loss and confidence of the predictions, and validate this relation empirically. 
Code is available at https://github.com/mlfoundations/model-soups.}\n}", "pdf": "https://proceedings.mlr.press/v162/wortsman22a/wortsman22a.pdf", "supp": "", "pdf_size": 1243707, "gs_citation": 1132, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16922194924900565989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of Washington; Columbia University; Google Research, Brain Team; Meta AI Research; Tel Aviv University; University of Washington; Columbia University; University of Washington; Tel Aviv University; Google Research, Brain Team; University of Washington", "aff_domain": "uw.edu; ; ; ; ; ; ; ; ; ; ", "email": "uw.edu; ; ; ; ; ; ; ; ; ; ", "github": "https://github.com/mlfoundations/model-soups", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/wortsman22a.html", "aff_unique_index": "0;1;2;3;4;0;1;0;4;2;0", "aff_unique_norm": "University of Washington;Columbia University;Google;Meta;Tel Aviv University", "aff_unique_dep": ";;Google Research;Meta AI Research;", "aff_unique_url": "https://www.washington.edu;https://www.columbia.edu;https://research.google;https://meta.com;https://www.tau.ac.il", "aff_unique_abbr": "UW;Columbia;Google;Meta AI;TAU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;1;0;0;0;1;0;0", "aff_country_unique": "United States;Israel" }, { "title": "Model-Free Opponent Shaping", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17105", "id": "17105", "proceeding": "https://proceedings.mlr.press/v162/lu22d.html", "poster": "/media/PosterPDFs/ICML%202022/1a04f965818a8533f5613003c7db243d.png?t=1658126724.3716214", "slides": "", "author_site": "Christopher Lu, Timon Willi, Christian Schroeder de Witt, Jakob Foerster", "author": "Christopher Lu; Timon Willi; Christian A Schroeder De Witt; Jakob Foerster", "abstract": "In general-sum games the interaction of self-interested learning agents commonly leads to collectively worst-case outcomes, such as defect-defect in the iterated prisoner\u2019s dilemma (IPD). To overcome this, some methods, such as Learning with Opponent-Learning Awareness (LOLA), directly shape the learning process of their opponents. However, these methods are myopic since only a small number of steps can be anticipated, are asymmetric since they treat other agents as naive learners, and require the use of higher-order derivatives, which are calculated through white-box access to an opponent\u2019s differentiable learning algorithm. To address these issues, we propose Model-Free Opponent Shaping (M-FOS). M-FOS learns in a meta-game in which each meta-step is an episode of the underlying game. The meta-state consists of the policies in the underlying game and the meta-policy produces a new policy to be used in the next episode. M-FOS then uses generic model-free optimisation methods to learn meta-policies that accomplish long-horizon opponent shaping. Empirically, M-FOS near-optimally exploits naive learners and other, more sophisticated algorithms from the literature. For example, to the best of our knowledge, it is the first method to learn the well-known ZD extortion strategy in the IPD. In the same settings, M-FOS leads to socially optimal outcomes under meta-self-play. 
Finally, we show that M-FOS can be scaled to high-dimensional settings.", "bibtex": "@InProceedings{pmlr-v162-lu22d,\n title = \t {Model-Free Opponent Shaping},\n author = {Lu, Christopher and Willi, Timon and De Witt, Christian A Schroeder and Foerster, Jakob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14398--14411},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22d/lu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22d.html},\n abstract = \t {In general-sum games the interaction of self-interested learning agents commonly leads to collectively worst-case outcomes, such as defect-defect in the iterated prisoner\u2019s dilemma (IPD). To overcome this, some methods, such as Learning with Opponent-Learning Awareness (LOLA), directly shape the learning process of their opponents. However, these methods are myopic since only a small number of steps can be anticipated, are asymmetric since they treat other agents as naive learners, and require the use of higher-order derivatives, which are calculated through white-box access to an opponent\u2019s differentiable learning algorithm. To address these issues, we propose Model-Free Opponent Shaping (M-FOS). M-FOS learns in a meta-game in which each meta-step is an episode of the underlying game. The meta-state consists of the policies in the underlying game and the meta-policy produces a new policy to be used in the next episode. M-FOS then uses generic model-free optimisation methods to learn meta-policies that accomplish long-horizon opponent shaping. Empirically, M-FOS near-optimally exploits naive learners and other, more sophisticated algorithms from the literature. For example, to the best of our knowledge, it is the first method to learn the well-known ZD extortion strategy in the IPD. In the same settings, M-FOS leads to socially optimal outcomes under meta-self-play. 
Finally, we show that M-FOS can be scaled to high-dimensional settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22d/lu22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lu22d-supp.zip", "pdf_size": 1018214, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2936183608022340062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Engineering Sciences, University of Oxford; Department of Engineering Sciences, University of Oxford; Department of Engineering Sciences, University of Oxford; Department of Engineering Sciences, University of Oxford", "aff_domain": "exeter.ox.ac.uk;exeter.ox.ac.uk; ; ", "email": "exeter.ox.ac.uk;exeter.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lu22d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Engineering Sciences", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Model-Value Inconsistency as a Signal for Epistemic Uncertainty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17965", "id": "17965", "proceeding": "https://proceedings.mlr.press/v162/filos22a.html", "poster": "/media/PosterPDFs/ICML%202022/add5aebfcb33a2206b6497d53bc4f309_rHI8zg4.png?t=1657880316.529055", "slides": "", "author_site": "Angelos Filos, Eszter V\u00e9rtes, Zita Marinho, Gregory Farquhar, Diana Borsa, Abe Friesen, Feryal Behbahani, Tom Schaul, Andre Barreto, Simon Osindero", "author": "Angelos Filos; Eszter V\u00e9rtes; Zita Marinho; Gregory Farquhar; Diana Borsa; Abram Friesen; Feryal Behbahani; Tom Schaul; Andre Barreto; Simon Osindero", "abstract": "Using a model of the environment and a value function, an agent can construct many estimates of a state\u2019s value, by unrolling the model for different lengths and bootstrapping with its value function. Our key insight is that one can treat this set of value estimates as a type of ensemble, which we call an implicit value ensemble (IVE). Consequently, the discrepancy between these estimates can be used as a proxy for the agent\u2019s epistemic uncertainty; we term this signal model-value inconsistency or self-inconsistency for short. Unlike prior work which estimates uncertainty by training an ensemble of many models and/or value functions, this approach requires only the single model and value function which are already being learned in most model-based reinforcement learning algorithms. 
We provide empirical evidence in both tabular and function approximation settings from pixels that self-inconsistency is useful (i) as a signal for exploration, (ii) for acting safely under distribution shifts, and (iii) for robustifying value-based planning with a learned model.", "bibtex": "@InProceedings{pmlr-v162-filos22a,\n title = \t {Model-Value Inconsistency as a Signal for Epistemic Uncertainty},\n author = {Filos, Angelos and V{\\'e}rtes, Eszter and Marinho, Zita and Farquhar, Gregory and Borsa, Diana and Friesen, Abram and Behbahani, Feryal and Schaul, Tom and Barreto, Andre and Osindero, Simon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6474--6498},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/filos22a/filos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/filos22a.html},\n abstract = \t {Using a model of the environment and a value function, an agent can construct many estimates of a state\u2019s value, by unrolling the model for different lengths and bootstrapping with its value function. Our key insight is that one can treat this set of value estimates as a type of ensemble, which we call an implicit value ensemble (IVE). Consequently, the discrepancy between these estimates can be used as a proxy for the agent\u2019s epistemic uncertainty; we term this signal model-value inconsistency or self-inconsistency for short. Unlike prior work which estimates uncertainty by training an ensemble of many models and/or value functions, this approach requires only the single model and value function which are already being learned in most model-based reinforcement learning algorithms. 
We provide empirical evidence in both tabular and function approximation settings from pixels that self-inconsistency is useful (i) as a signal for exploration, (ii) for acting safely under distribution shifts, and (iii) for robustifying value-based planning with a learned model.}\n}", "pdf": "https://proceedings.mlr.press/v162/filos22a/filos22a.pdf", "supp": "", "pdf_size": 12472999, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14119756197418919418&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind + University of Oxford", "aff_domain": "cs.ox.ac.uk; ; ; ; ; ; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/filos22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0+1", "aff_unique_norm": "DeepMind;University of Oxford", "aff_unique_dep": ";", "aff_unique_url": "https://deepmind.com;https://www.ox.ac.uk", "aff_unique_abbr": "DeepMind;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Model-based Meta Reinforcement Learning using Graph Structured Surrogate Models and Amortized Policy Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17729", "id": "17729", "proceeding": "https://proceedings.mlr.press/v162/wang22z.html", "poster": "", "slides": "", "author_site": "Qi Wang, Herke van Hoof", "author": "Qi Wang; Herke Van Hoof", "abstract": "Reinforcement learning is a promising paradigm for solving sequential decision-making problems, but low data efficiency and weak generalization across tasks are bottlenecks in real-world applications. Model-based meta reinforcement learning addresses these issues by learning dynamics and leveraging knowledge from prior experience. In this paper, we take a closer look at this framework and propose a new posterior sampling based approach that consists of a new model to identify task dynamics together with an amortized policy optimization step. We show that our model, called a graph structured surrogate model (GSSM), achieves competitive dynamics prediction performance with lower model complexity. Moreover, our approach in policy search is able to obtain high returns and allows fast execution by avoiding test-time policy gradient updates.", "bibtex": "@InProceedings{pmlr-v162-wang22z,\n title = \t {Model-based Meta Reinforcement Learning using Graph Structured Surrogate Models and Amortized Policy Search},\n author = {Wang, Qi and Van Hoof, Herke},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23055--23077},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22z/wang22z.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22z.html},\n abstract = \t {Reinforcement learning is a promising paradigm for solving sequential decision-making problems, but low data efficiency and weak generalization across tasks are bottlenecks in real-world applications. 
Model-based meta reinforcement learning addresses these issues by learning dynamics and leveraging knowledge from prior experience. In this paper, we take a closer look at this framework and propose a new posterior sampling based approach that consists of a new model to identify task dynamics together with an amortized policy optimization step. We show that our model, called a graph structured surrogate model (GSSM), achieves competitive dynamics prediction performance with lower model complexity. Moreover, our approach in policy search is able to obtain high returns and allows fast execution by avoiding test-time policy gradient updates.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22z/wang22z.pdf", "supp": "", "pdf_size": 2217697, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17089104974872746972&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Amsterdam Machine Learning Lab, University of Amsterdam, Amsterdam, the Netherlands; Amsterdam Machine Learning Lab, University of Amsterdam, Amsterdam, the Netherlands", "aff_domain": "uva.nl; ", "email": "uva.nl; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22z.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "Amsterdam Machine Learning Lab", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amsterdam", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Modeling Adversarial Noise for Adversarial Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17661", "id": "17661", "proceeding": "https://proceedings.mlr.press/v162/zhou22k.html", "poster": "/media/PosterPDFs/ICML%202022/d63c4a5e9b600279c3da776f6113a400_Eau9f8r.png?t=1657167668.954754", "slides": "", "author_site": "Dawei Zhou, Nannan Wang, Bo Han, Tongliang Liu", "author": "Dawei Zhou; Nannan Wang; Bo Han; Tongliang Liu", "abstract": "Deep neural networks have been demonstrated to be vulnerable to adversarial noise, promoting the development of defense against adversarial attacks. Motivated by the fact that adversarial noise contains well-generalizing features and that the relationship between adversarial data and natural data can help infer natural data and make reliable predictions, in this paper, we study to model adversarial noise by learning the transition relationship between adversarial labels (i.e. the flipped labels used to generate adversarial data) and natural labels (i.e. the ground truth labels of the natural data). Specifically, we introduce an instance-dependent transition matrix to relate adversarial labels and natural labels, which can be seamlessly embedded with the target model (enabling us to model stronger adaptive adversarial noise). 
Empirical evaluations demonstrate that our method could effectively improve adversarial accuracy.", "bibtex": "@InProceedings{pmlr-v162-zhou22k,\n title = \t {Modeling Adversarial Noise for Adversarial Training},\n author = {Zhou, Dawei and Wang, Nannan and Han, Bo and Liu, Tongliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27353--27366},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22k/zhou22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22k.html},\n abstract = \t {Deep neural networks have been demonstrated to be vulnerable to adversarial noise, promoting the development of defense against adversarial attacks. Motivated by the fact that adversarial noise contains well-generalizing features and that the relationship between adversarial data and natural data can help infer natural data and make reliable predictions, in this paper, we study to model adversarial noise by learning the transition relationship between adversarial labels (i.e. the flipped labels used to generate adversarial data) and natural labels (i.e. the ground truth labels of the natural data). Specifically, we introduce an instance-dependent transition matrix to relate adversarial labels and natural labels, which can be seamlessly embedded with the target model (enabling us to model stronger adaptive adversarial noise). Empirical evaluations demonstrate that our method could effectively improve adversarial accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22k/zhou22k.pdf", "supp": "", "pdf_size": 1568113, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6688229047921425158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "ISN Lab, School of Telecommunications Engineering, Xidian University + TML Lab, Sydney AI Centre, The University of Sydney; ISN Lab, School of Telecommunications Engineering, Xidian University; Department of Computer Science, Hong Kong Baptist University; TML Lab, Sydney AI Centre, The University of Sydney", "aff_domain": "gmail.com;xidian.edu.cn;comp.hkbu.edu.hk;sydney.edu.au", "email": "gmail.com;xidian.edu.cn;comp.hkbu.edu.hk;sydney.edu.au", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhou22k.html", "aff_unique_index": "0+1;0;2;1", "aff_unique_norm": "Xidian University;University of Sydney;Hong Kong Baptist University", "aff_unique_dep": "School of Telecommunications Engineering;TML Lab;Department of Computer Science", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.sydney.edu.au;https://www.hkbu.edu.hk", "aff_unique_abbr": "Xidian;USYD;HKBU", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Sydney;Hong Kong SAR", "aff_country_unique_index": "0+1;0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Modeling Irregular Time Series with Continuous Recurrent Units", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16343", "id": "16343", "proceeding": "https://proceedings.mlr.press/v162/schirmer22a.html", "poster": "/media/PosterPDFs/ICML%202022/5b4130c9e891d39891289001cc97d86b.png?t=1657906969.4038186", "slides": "/media/icml-2022/Slides/16343.pdf", "author_site": "Mona Schirmer, Mazin Eltayeb, 
Stefan Lessmann, Maja Rudolph", "author": "Mona Schirmer; Mazin Eltayeb; Stefan Lessmann; Maja Rudolph", "abstract": "Recurrent neural networks (RNNs) are a popular choice for modeling sequential data. Modern RNN architectures assume constant time-intervals between observations. However, in many datasets (e.g. medical records) observation times are irregular and can carry important information. To address this challenge, we propose continuous recurrent units (CRUs) {\u2013} a neural architecture that can naturally handle irregular intervals between observations. The CRU assumes a hidden state, which evolves according to a linear stochastic differential equation and is integrated into an encoder-decoder framework. The recursive computations of the CRU can be derived using the continuous-discrete Kalman filter and are in closed form. The resulting recurrent architecture has temporal continuity between hidden states and a gating mechanism that can optimally integrate noisy observations. We derive an efficient parameterization scheme for the CRU that leads to a fast implementation f-CRU. We empirically study the CRU on a number of challenging datasets and find that it can interpolate irregular time series better than methods based on neural ordinary differential equations.", "bibtex": "@InProceedings{pmlr-v162-schirmer22a,\n title = \t {Modeling Irregular Time Series with Continuous Recurrent Units},\n author = {Schirmer, Mona and Eltayeb, Mazin and Lessmann, Stefan and Rudolph, Maja},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19388--19405},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/schirmer22a/schirmer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/schirmer22a.html},\n abstract = \t {Recurrent neural networks (RNNs) are a popular choice for modeling sequential data. Modern RNN architectures assume constant time-intervals between observations. However, in many datasets (e.g. medical records) observation times are irregular and can carry important information. To address this challenge, we propose continuous recurrent units (CRUs) {\u2013} a neural architecture that can naturally handle irregular intervals between observations. The CRU assumes a hidden state, which evolves according to a linear stochastic differential equation and is integrated into an encoder-decoder framework. The recursive computations of the CRU can be derived using the continuous-discrete Kalman filter and are in closed form. The resulting recurrent architecture has temporal continuity between hidden states and a gating mechanism that can optimally integrate noisy observations. We derive an efficient parameterization scheme for the CRU that leads to a fast implementation f-CRU. 
We empirically study the CRU on a number of challenging datasets and find that it can interpolate irregular time series better than methods based on neural ordinary differential equations.}\n}", "pdf": "https://proceedings.mlr.press/v162/schirmer22a/schirmer22a.pdf", "supp": "", "pdf_size": 1270899, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7564792311041526490&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Humboldt-Universit\u00e4t zu Berlin, Germany+ Bosch Center for AI, Germany; Bosch Center for AI, Germany; Humboldt-Universit\u00e4t zu Berlin, Germany; Bosch Center for AI, USA", "aff_domain": "ensae.fr; ; ;us.bosch.com", "email": "ensae.fr; ; ;us.bosch.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/schirmer22a.html", "aff_unique_index": "0+1;1;0;1", "aff_unique_norm": "Humboldt University of Berlin;Bosch Center for AI", "aff_unique_dep": ";AI", "aff_unique_url": "https://www.hu-berlin.de;https://www.bosch-ai.com", "aff_unique_abbr": "HU Berlin;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;1", "aff_country_unique": "Germany;United States" }, { "title": "Modeling Strong and Human-Like Gameplay with KL-Regularized Search", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16681", "id": "16681", "proceeding": "https://proceedings.mlr.press/v162/jacob22a.html", "poster": "/media/PosterPDFs/ICML%202022/0233f3bb964cf325a30f8b1c2ed2da93.png?t=1658247660.800565", "slides": "", "author_site": "Athul Paul Jacob, David Wu, Gabriele Farina, Adam Lerer, Hengyuan Hu, Anton Bakhtin, Jacob Andreas, Noam Brown", "author": "Athul Paul Jacob; David J Wu; Gabriele Farina; Adam Lerer; Hengyuan Hu; Anton Bakhtin; Jacob Andreas; Noam Brown", "abstract": "We consider the task of accurately modeling strong human policies in multi-agent decision-making problems, given examples of human behavior. Imitation learning is effective at predicting human actions but may not match the strength of expert humans (e.g., by sometimes committing blunders), while self-play learning and search techniques such as AlphaZero lead to strong performance but may produce policies that differ markedly from human behavior. In chess and Go, we show that regularized search algorithms that penalize KL divergence from an imitation-learned policy yield higher prediction accuracy of strong humans and better performance than imitation learning alone. 
We then introduce a novel regret minimization algorithm that is regularized based on the KL divergence from an imitation-learned policy, and show that using this algorithm for search in no-press Diplomacy yields a policy that matches the human prediction accuracy of imitation learning while being substantially stronger.", "bibtex": "@InProceedings{pmlr-v162-jacob22a,\n title = \t {Modeling Strong and Human-Like Gameplay with {KL}-Regularized Search},\n author = {Jacob, Athul Paul and Wu, David J and Farina, Gabriele and Lerer, Adam and Hu, Hengyuan and Bakhtin, Anton and Andreas, Jacob and Brown, Noam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9695--9728},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jacob22a/jacob22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jacob22a.html},\n abstract = \t {We consider the task of accurately modeling strong human policies in multi-agent decision-making problems, given examples of human behavior. Imitation learning is effective at predicting human actions but may not match the strength of expert humans (e.g., by sometimes committing blunders), while self-play learning and search techniques such as AlphaZero lead to strong performance but may produce policies that differ markedly from human behavior. In chess and Go, we show that regularized search algorithms that penalize KL divergence from an imitation-learned policy yield higher prediction accuracy of strong humans and better performance than imitation learning alone. 
We then introduce a novel regret minimization algorithm that is regularized based on the KL divergence from an imitation-learned policy, and show that using this algorithm for search in no-press Diplomacy yields a policy that matches the human prediction accuracy of imitation learning while being substantially stronger.}\n}", "pdf": "https://proceedings.mlr.press/v162/jacob22a/jacob22a.pdf", "supp": "", "pdf_size": 900283, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8580789180356044815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Meta AI Research, New York, NY, USA+CSAIL, MIT, Cambridge, MA, USA; Meta AI Research, New York, NY, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, PA, USA; Meta AI Research, New York, NY, USA; Meta AI Research, New York, NY, USA; Meta AI Research, New York, NY, USA; CSAIL, MIT, Cambridge, MA, USA; Meta AI Research, New York, NY, USA", "aff_domain": "mit.edu;fb.com;cs.cmu.edu; ; ; ;mit.edu;fb.com", "email": "mit.edu;fb.com;cs.cmu.edu; ; ; ;mit.edu;fb.com", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/jacob22a.html", "aff_unique_index": "0+1;0;2;0;0;0;1;0", "aff_unique_norm": "Meta;Massachusetts Institute of Technology;Carnegie Mellon University", "aff_unique_dep": "Meta AI Research;Computer Science and Artificial Intelligence Laboratory;School of Computer Science", "aff_unique_url": "https://meta.ai;https://www.csail.mit.edu;https://www.cmu.edu", "aff_unique_abbr": "Meta AI;MIT;CMU", "aff_campus_unique_index": "0+1;0;2;0;0;0;1;0", "aff_campus_unique": "New York;Cambridge;Pittsburgh", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Modeling Structure with Undirected Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16307", "id": "16307", "proceeding": "https://proceedings.mlr.press/v162/mihaylova22a.html", "poster": "/media/PosterPDFs/ICML%202022/c7558e9d1f956b016d1fdba7ea132378.png?t=1657476973.6907506", "slides": "/media/icml-2022/Slides/16307.pdf", "author_site": "Tsvetomila Mihaylova, Vlad Niculae, Andre Filipe Torres Martins", "author": "Tsvetomila Mihaylova; Vlad Niculae; Andre Martins", "abstract": "Neural networks are powerful function estimators, leading to their status as a paradigm of choice for modeling structured data. However, unlike other structured representations that emphasize the modularity of the problem {\u2013} e.g., factor graphs {\u2013} neural networks are usually monolithic mappings from inputs to outputs, with a fixed computation order. This limitation prevents them from capturing different directions of computation and interaction between the modeled variables. In this paper, we combine the representational strengths of factor graphs and of neural networks, proposing undirected neural networks (UNNs): a flexible framework for specifying computations that can be performed in any order. For particular choices, our proposed models subsume and extend many existing architectures: feed-forward, recurrent, self-attention networks, auto-encoders, and networks with implicit layers. We demonstrate the effectiveness of undirected neural architectures, both unstructured and structured, on a range of tasks: tree-constrained dependency parsing, convolutional image classification, and sequence completion with attention. 
By varying the computation order, we show how a single UNN can be used both as a classifier and a prototype generator, and how it can fill in missing parts of an input sequence, making them a promising field for further research.", "bibtex": "@InProceedings{pmlr-v162-mihaylova22a,\n title = \t {Modeling Structure with Undirected Neural Networks},\n author = {Mihaylova, Tsvetomila and Niculae, Vlad and Martins, Andre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15544--15560},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mihaylova22a/mihaylova22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mihaylova22a.html},\n abstract = \t {Neural networks are powerful function estimators, leading to their status as a paradigm of choice for modeling structured data. However, unlike other structured representations that emphasize the modularity of the problem {\u2013} e.g., factor graphs {\u2013} neural networks are usually monolithic mappings from inputs to outputs, with a fixed computation order. This limitation prevents them from capturing different directions of computation and interaction between the modeled variables. In this paper, we combine the representational strengths of factor graphs and of neural networks, proposing undirected neural networks (UNNs): a flexible framework for specifying computations that can be performed in any order. For particular choices, our proposed models subsume and extend many existing architectures: feed-forward, recurrent, self-attention networks, auto-encoders, and networks with implicit layers. We demonstrate the effectiveness of undirected neural architectures, both unstructured and structured, on a range of tasks: tree-constrained dependency parsing, convolutional image classification, and sequence completion with attention. 
By varying the computation order, we show how a single UNN can be used both as a classifier and a prototype generator, and how it can fill in missing parts of an input sequence, making them a promising field for further research.}\n}", "pdf": "https://proceedings.mlr.press/v162/mihaylova22a/mihaylova22a.pdf", "supp": "", "pdf_size": 872192, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2812799179011776020&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Instituto de Telecomunica\u00e7\u00f5es, Instituto Superior T\u00e9cnico, Lisbon, Portugal+LUMLIS, Lisbon ELLIS Unit, Portugal+Unbabel, Lisbon, Portugal; Language Technology Lab, University of Amsterdam, The Netherlands; Instituto de Telecomunica\u00e7\u00f5es, Instituto Superior T\u00e9cnico, Lisbon, Portugal+LUMLIS, Lisbon ELLIS Unit, Portugal+Unbabel, Lisbon, Portugal", "aff_domain": "tecnico.ulisboa.pt; ; ", "email": "tecnico.ulisboa.pt; ; ", "github": "https://github.com/deep-spin/unn", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mihaylova22a.html", "aff_unique_index": "0+1+2;3;0+1+2", "aff_unique_norm": "Instituto Superior T\u00e9cnico;Lisbon ELLIS Unit;Unbabel;University of Amsterdam", "aff_unique_dep": "Instituto de Telecomunica\u00e7\u00f5es;LUMLIS;;Language Technology Lab", "aff_unique_url": "https://www.ist.utl.pt;;https://www.unbabel.com;https://www.uva.nl", "aff_unique_abbr": "IST;;;UvA", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Lisbon;", "aff_country_unique_index": "0+0+0;1;0+0+0", "aff_country_unique": "Portugal;Netherlands" }, { "title": "Modular Conformal Calibration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17971", "id": "17971", "proceeding": "https://proceedings.mlr.press/v162/marx22a.html", "poster": "/media/PosterPDFs/ICML%202022/ac64504cc249b070772848642cffe6ff.png?t=1658140484.4424984", "slides": "", "author_site": "Charles Marx, Shengjia Zhao, Willie Neiswanger, Stefano Ermon", "author": "Charles Marx; Shengjia Zhao; Willie Neiswanger; Stefano Ermon", "abstract": "Uncertainty estimates must be calibrated (i.e., accurate) and sharp (i.e., informative) in order to be useful. This has motivated a variety of methods for", "bibtex": "@InProceedings{pmlr-v162-marx22a,\n title = \t {Modular Conformal Calibration},\n author = {Marx, Charles and Zhao, Shengjia and Neiswanger, Willie and Ermon, Stefano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15180--15195},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/marx22a/marx22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/marx22a.html},\n abstract = \t {Uncertainty estimates must be calibrated (i.e., accurate) and sharp (i.e., informative) in order to be useful. 
This has motivated a variety of methods for", "pdf": "https://proceedings.mlr.press/v162/marx22a/marx22a.pdf", "supp": "", "pdf_size": 578419, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17811738478475717016&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Computer Science Department, Stanford University; Computer Science Department, Stanford University; Computer Science Department, Stanford University; Computer Science Department, Stanford University", "aff_domain": "stanford.edu; ; ; ", "email": "stanford.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/marx22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Molecular Representation Learning via Heterogeneous Motif Graph Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17131", "id": "17131", "proceeding": "https://proceedings.mlr.press/v162/yu22a.html", "poster": "/media/PosterPDFs/ICML%202022/d1e96978c6935ec01d995b1b8e4d8c33.png?t=1657228889.4075267", "slides": "/media/icml-2022/Slides/17131.pdf", "author_site": "Zhaoning Yu, Hongyang Gao", "author": "Zhaoning Yu; Hongyang Gao", "abstract": "We consider feature representation learning problem of molecular graphs. Graph Neural Networks have been widely used in feature representation learning of molecular graphs. However, most existing methods deal with molecular graphs individually while neglecting their connections, such as motif-level relationships. We propose a novel molecular graph representation learning method by constructing a heterogeneous motif graph to address this issue. In particular, we build a heterogeneous motif graph that contains motif nodes and molecular nodes. Each motif node corresponds to a motif extracted from molecules. Then, we propose a Heterogeneous Motif Graph Neural Network (HM-GNN) to learn feature representations for each node in the heterogeneous motif graph. Our heterogeneous motif graph also enables effective multi-task learning, especially for small molecular datasets. To address the potential efficiency issue, we propose to use an edge sampler, which can significantly reduce computational resources usage. The experimental results show that our model consistently outperforms previous state-of-the-art models. Under multi-task settings, the promising performances of our methods on combined datasets shed light on a new learning paradigm for small molecular datasets. 
Finally, we show that our model achieves similar performances with significantly less computational resources by using our edge sampler.", "bibtex": "@InProceedings{pmlr-v162-yu22a,\n title = \t {Molecular Representation Learning via Heterogeneous Motif Graph Neural Networks},\n author = {Yu, Zhaoning and Gao, Hongyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25581--25594},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22a/yu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22a.html},\n abstract = \t {We consider feature representation learning problem of molecular graphs. Graph Neural Networks have been widely used in feature representation learning of molecular graphs. However, most existing methods deal with molecular graphs individually while neglecting their connections, such as motif-level relationships. We propose a novel molecular graph representation learning method by constructing a heterogeneous motif graph to address this issue. In particular, we build a heterogeneous motif graph that contains motif nodes and molecular nodes. Each motif node corresponds to a motif extracted from molecules. Then, we propose a Heterogeneous Motif Graph Neural Network (HM-GNN) to learn feature representations for each node in the heterogeneous motif graph. Our heterogeneous motif graph also enables effective multi-task learning, especially for small molecular datasets. To address the potential efficiency issue, we propose to use an edge sampler, which can significantly reduce computational resources usage. The experimental results show that our model consistently outperforms previous state-of-the-art models. Under multi-task settings, the promising performances of our methods on combined datasets shed light on a new learning paradigm for small molecular datasets. 
Finally, we show that our model achieves similar performances with significantly less computational resources by using our edge sampler.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22a/yu22a.pdf", "supp": "", "pdf_size": 493755, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16142260161361576450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Iowa State University, Ames, the United States of America; Department of Computer Science, Iowa State University, Ames, the United States of America", "aff_domain": "iastate.edu;iastate.edu", "email": "iastate.edu;iastate.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yu22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ames", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Monarch: Expressive Structured Matrices for Efficient and Accurate Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17899", "id": "17899", "proceeding": "https://proceedings.mlr.press/v162/dao22a.html", "poster": "/media/PosterPDFs/ICML%202022/66808e327dc79d135ba18e051673d906.png?t=1657952271.9114769", "slides": "", "author_site": "Tri Dao, Beidi Chen, Nimit Sohoni, Arjun Desai, Michael Poli, Jessica Grogan, Alexander Liu, Aniruddh Rao, Atri Rudra, Christopher Re", "author": "Tri Dao; Beidi Chen; Nimit S Sohoni; Arjun Desai; Michael Poli; Jessica Grogan; Alexander Liu; Aniruddh Rao; Atri Rudra; Christopher Re", "abstract": "Large neural networks excel in many domains, but they are expensive to train and fine-tune. A popular approach to reduce their compute or memory requirements is to replace dense weight matrices with structured ones (e.g., sparse, low-rank, Fourier transform). These methods have not seen widespread adoption (1) in end-to-end training due to unfavorable efficiency\u2013quality tradeoffs, and (2) in dense-to-sparse fine-tuning due to lack of tractable algorithms to approximate a given dense weight matrix. To address these issues, we propose a class of matrices (Monarch) that is hardware-efficient (they are parameterized as products of two block-diagonal matrices for better hardware utilization) and expressive (they can represent many commonly used transforms). Surprisingly, the problem of approximating a dense weight matrix with a Monarch matrix, though nonconvex, has an analytical optimal solution. These properties of Monarch matrices unlock new ways to train and fine-tune sparse and dense models. We empirically validate that Monarch can achieve favorable accuracy-efficiency tradeoffs in several end-to-end sparse training applications: speeding up ViT and GPT-2 training on ImageNet classification and Wikitext-103 language modeling by 2x with comparable model quality, and reducing the error on PDE solving and MRI reconstruction tasks by 40%. In sparse-to-dense training, with a simple technique called \"reverse sparsification,\" Monarch matrices serve as a useful intermediate representation to speed up GPT-2 pretraining on OpenWebText by 2x without quality drop. The same technique brings 23% faster BERT pretraining than even the very optimized implementation from Nvidia that set the MLPerf 1.1 record. 
In dense-to-sparse fine-tuning, as a proof-of-concept, our Monarch approximation algorithm speeds up BERT fine-tuning on GLUE by 1.7x with comparable accuracy.", "bibtex": "@InProceedings{pmlr-v162-dao22a,\n title = \t {Monarch: Expressive Structured Matrices for Efficient and Accurate Training},\n author = {Dao, Tri and Chen, Beidi and Sohoni, Nimit S and Desai, Arjun and Poli, Michael and Grogan, Jessica and Liu, Alexander and Rao, Aniruddh and Rudra, Atri and Re, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4690--4721},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dao22a/dao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dao22a.html},\n abstract = \t {Large neural networks excel in many domains, but they are expensive to train and fine-tune. A popular approach to reduce their compute or memory requirements is to replace dense weight matrices with structured ones (e.g., sparse, low-rank, Fourier transform). These methods have not seen widespread adoption (1) in end-to-end training due to unfavorable efficiency\u2013quality tradeoffs, and (2) in dense-to-sparse fine-tuning due to lack of tractable algorithms to approximate a given dense weight matrix. To address these issues, we propose a class of matrices (Monarch) that is hardware-efficient (they are parameterized as products of two block-diagonal matrices for better hardware utilization) and expressive (they can represent many commonly used transforms). Surprisingly, the problem of approximating a dense weight matrix with a Monarch matrix, though nonconvex, has an analytical optimal solution. These properties of Monarch matrices unlock new ways to train and fine-tune sparse and dense models. We empirically validate that Monarch can achieve favorable accuracy-efficiency tradeoffs in several end-to-end sparse training applications: speeding up ViT and GPT-2 training on ImageNet classification and Wikitext-103 language modeling by 2x with comparable model quality, and reducing the error on PDE solving and MRI reconstruction tasks by 40%. In sparse-to-dense training, with a simple technique called \"reverse sparsification,\" Monarch matrices serve as a useful intermediate representation to speed up GPT-2 pretraining on OpenWebText by 2x without quality drop. The same technique brings 23% faster BERT pretraining than even the very optimized implementation from Nvidia that set the MLPerf 1.1 record. 
In dense-to-sparse fine-tuning, as a proof-of-concept, our Monarch approximation algorithm speeds up BERT fine-tuning on GLUE by 1.7x with comparable accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/dao22a/dao22a.pdf", "supp": "", "pdf_size": 5030951, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=908299519413693348&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Stanford University, USA; Stanford University, USA + Carnegie Mellon University, USA; Stanford University, USA; Stanford University, USA; Stanford University, USA; University at Buffalo, SUNY, USA; University of Michigan, USA; University of Michigan, USA; University at Buffalo, SUNY, USA; Stanford University, USA", "aff_domain": "cs.stanford.edu; ; ; ; ; ; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/dao22a.html", "aff_unique_index": "0;0+1;0;0;0;2;3;3;2;0", "aff_unique_norm": "Stanford University;Carnegie Mellon University;University at Buffalo;University of Michigan", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu;https://www.buffalo.edu;https://www.umich.edu", "aff_unique_abbr": "Stanford;CMU;UB;UM", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0+0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "More Efficient Sampling for Tensor Decomposition With Worst-Case Guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16645", "id": "16645", "proceeding": "https://proceedings.mlr.press/v162/malik22a.html", "poster": "/media/PosterPDFs/ICML%202022/d7a84628c025d30f7b2c52c958767e76.png?t=1657470066.5101268", "slides": "", "author": "Osman Asif Malik", "abstract": "Recent papers have developed alternating least squares (ALS) methods for CP and tensor ring decomposition with a per-iteration cost which is sublinear in the number of input tensor entries for low-rank decomposition. However, the per-iteration cost of these methods still has an exponential dependence on the number of tensor modes when parameters are chosen to achieve certain worst-case guarantees. In this paper, we propose sampling-based ALS methods for the CP and tensor ring decompositions whose cost does not have this exponential dependence, thereby significantly improving on the previous state-of-the-art. We provide a detailed theoretical analysis and also apply the methods in a feature extraction experiment.", "bibtex": "@InProceedings{pmlr-v162-malik22a,\n title = \t {More Efficient Sampling for Tensor Decomposition With Worst-Case Guarantees},\n author = {Malik, Osman Asif},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14887--14917},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/malik22a/malik22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/malik22a.html},\n abstract = \t {Recent papers have developed alternating least squares (ALS) methods for CP and tensor ring decomposition with a per-iteration cost which is sublinear in the number of input tensor entries for low-rank decomposition. 
However, the per-iteration cost of these methods still has an exponential dependence on the number of tensor modes when parameters are chosen to achieve certain worst-case guarantees. In this paper, we propose sampling-based ALS methods for the CP and tensor ring decompositions whose cost does not have this exponential dependence, thereby significantly improving on the previous state-of-the-art. We provide a detailed theoretical analysis and also apply the methods in a feature extraction experiment.}\n}", "pdf": "https://proceedings.mlr.press/v162/malik22a/malik22a.pdf", "supp": "", "pdf_size": 1344563, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18131307988891143062&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Applied Mathematics & Computational Research Division, Lawrence Berkeley National Laboratory, Berkeley, USA", "aff_domain": "lbl.gov", "email": "lbl.gov", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/malik22a.html", "aff_unique_index": "0", "aff_unique_norm": "Lawrence Berkeley National Laboratory", "aff_unique_dep": "Applied Mathematics & Computational Research Division", "aff_unique_url": "https://www.lbl.gov", "aff_unique_abbr": "LBL", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "More Than a Toy: Random Matrix Models Predict How Real-World Neural Representations Generalize", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18327", "id": "18327", "proceeding": "https://proceedings.mlr.press/v162/wei22a.html", "poster": "/media/PosterPDFs/ICML%202022/98e6f17209029f4ae6dc9d88ec8eac2c.png?t=1658346805.8202236", "slides": "", "author_site": "Alexander Wei, Wei Hu, Jacob Steinhardt", "author": "Alexander Wei; Wei Hu; Jacob Steinhardt", "abstract": "Of theories for why large-scale machine learning models generalize despite being vastly overparameterized, which of their assumptions are needed to capture the qualitative phenomena of generalization in the real world? On one hand, we find that most theoretical analyses fall short of capturing these qualitative phenomena even for kernel regression, when applied to kernels derived from large-scale neural networks (e.g., ResNet-50) and real data (e.g., CIFAR-100). On the other hand, we find that the classical GCV estimator (Craven and Wahba, 1978) accurately predicts generalization risk even in such overparameterized settings. To bolster this empirical finding, we prove that the GCV estimator converges to the generalization risk whenever a local random matrix law holds. Finally, we apply this random matrix theory lens to explain why pretrained representations generalize better as well as what factors govern scaling laws for kernel regression. 
Our findings suggest that random matrix theory, rather than just being a toy model, may be central to understanding the properties of neural representations in practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/wei22a/wei22a.pdf", "supp": "", "pdf_size": 3431417, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1474486738748185643&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "UC Berkeley; UC Berkeley; UC Berkeley", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wei22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multi Resolution Analysis (MRA) for Approximate Self-Attention", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17209", "id": "17209", "proceeding": "https://proceedings.mlr.press/v162/zeng22a.html", "poster": "/media/PosterPDFs/ICML%202022/1c1d4df596d01da60385f0bb17a4a9e0.png?t=1658367911.0161877", "slides": "", "author_site": "Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn Fung, Vikas Singh", "author": "Zhanpeng Zeng; Sourav Pal; Jeffery Kline; Glenn M Fung; Vikas Singh", "abstract": "Transformers have emerged as a preferred model for many tasks in natural language processing and vision. 
Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. Code is available at \\url{https://github.com/mlpen/mra-attention}.", "bibtex": "@InProceedings{pmlr-v162-zeng22a,\n title = \t {Multi Resolution Analysis ({MRA}) for Approximate Self-Attention},\n author = {Zeng, Zhanpeng and Pal, Sourav and Kline, Jeffery and Fung, Glenn M and Singh, Vikas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25955--25972},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zeng22a/zeng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zeng22a.html},\n abstract = \t {Transformers have emerged as a preferred model for many tasks in natural language processing and vision. Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. 
Code is available at \\url{https://github.com/mlpen/mra-attention}.}\n}", "pdf": "https://proceedings.mlr.press/v162/zeng22a/zeng22a.pdf", "supp": "", "pdf_size": 4886855, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=184055539633336213&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "University of Wisconsin, Madison, USA; University of Wisconsin, Madison, USA; American Family Insurance, Madison, USA; American Family Insurance, Madison, USA; University of Wisconsin, Madison, USA", "aff_domain": "wisc.edu; ; ; ; ", "email": "wisc.edu; ; ; ; ", "github": "https://github.com/mlpen/mra-attention", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zeng22a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "University of Wisconsin-Madison;American Family Insurance", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.amfam.com", "aff_unique_abbr": "UW-Madison;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17335", "id": "17335", "proceeding": "https://proceedings.mlr.press/v162/zeng22c.html", "poster": "/media/PosterPDFs/ICML%202022/0731460a8a5ce1626210cbf4385ae0ef.png?t=1657545435.8015208", "slides": "", "author_site": "Yan Zeng, Xinsong Zhang, Hang Li", "author": "Yan Zeng; Xinsong Zhang; Hang Li", "abstract": "Most existing methods in vision language pre-training rely on object-centric features extracted through object detection and make fine-grained alignments between the extracted features and texts. It is challenging for these methods to learn relations among multiple objects. To this end, we propose a new method called X-VLM to perform \u2018multi-grained vision language pre-training.\u2019 The key to learning multi-grained alignments is to locate visual concepts in the image given the associated texts, and in the meantime align the texts with the visual concepts, where the alignments are in multi-granularity. Experimental results show that X-VLM effectively leverages the learned multi-grained alignments to many downstream vision language tasks and consistently outperforms state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-zeng22c,\n title = \t {Multi-Grained Vision Language Pre-Training: Aligning Texts with Visual Concepts},\n author = {Zeng, Yan and Zhang, Xinsong and Li, Hang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25994--26009},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zeng22c/zeng22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zeng22c.html},\n abstract = \t {Most existing methods in vision language pre-training rely on object-centric features extracted through object detection and make fine-grained alignments between the extracted features and texts. It is challenging for these methods to learn relations among multiple objects. 
To this end, we propose a new method called X-VLM to perform \u2018multi-grained vision language pre-training.\u2019 The key to learning multi-grained alignments is to locate visual concepts in the image given the associated texts, and in the meantime align the texts with the visual concepts, where the alignments are in multi-granularity. Experimental results show that X-VLM effectively leverages the learned multi-grained alignments to many downstream vision language tasks and consistently outperforms state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zeng22c/zeng22c.pdf", "supp": "", "pdf_size": 22733188, "gs_citation": 346, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8119995839638175849&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "ByteDance AI Lab; ByteDance AI Lab; ByteDance AI Lab", "aff_domain": "bytedance.com; ; ", "email": "bytedance.com; ; ", "github": "https://github.com/zengyan-97/X-VLM", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zeng22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ByteDance", "aff_unique_dep": "AI Lab", "aff_unique_url": "https://www.bytedance.com", "aff_unique_abbr": "ByteDance", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Multi-Level Branched Regularization for Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18369", "id": "18369", "proceeding": "https://proceedings.mlr.press/v162/kim22a.html", "poster": "/media/PosterPDFs/ICML%202022/7f100b7b36092fb9b06dfb4fac360931.png?t=1657787151.1995988", "slides": "", "author_site": "Jinkyu Kim, Geeho Kim, Bohyung Han", "author": "Jinkyu Kim; Geeho Kim; Bohyung Han", "abstract": "A critical challenge of federated learning is data heterogeneity and imbalance across clients, which leads to inconsistency between local networks and unstable convergence of global models. To alleviate the limitations, we propose a novel architectural regularization technique that constructs multiple auxiliary branches in each local model by grafting local and global subnetworks at several different levels and that learns the representations of the main pathway in the local model congruent to the auxiliary hybrid pathways via online knowledge distillation. The proposed technique is effective to robustify the global model even in the non-iid setting and is applicable to various federated learning frameworks conveniently without incurring extra communication costs. We perform comprehensive empirical studies and demonstrate remarkable performance gains in terms of accuracy and efficiency compared to existing methods. 
The source code is available at our project page.", "bibtex": "@InProceedings{pmlr-v162-kim22a,\n title = \t {Multi-Level Branched Regularization for Federated Learning},\n author = {Kim, Jinkyu and Kim, Geeho and Han, Bohyung},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11058--11073},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22a/kim22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22a.html},\n abstract = \t {A critical challenge of federated learning is data heterogeneity and imbalance across clients, which leads to inconsistency between local networks and unstable convergence of global models. To alleviate the limitations, we propose a novel architectural regularization technique that constructs multiple auxiliary branches in each local model by grafting local and global subnetworks at several different levels and that learns the representations of the main pathway in the local model congruent to the auxiliary hybrid pathways via online knowledge distillation. The proposed technique is effective to robustify the global model even in the non-iid setting and is applicable to various federated learning frameworks conveniently without incurring extra communication costs. We perform comprehensive empirical studies and demonstrate remarkable performance gains in terms of accuracy and efficiency compared to existing methods. The source code is available at our project page.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22a/kim22a.pdf", "supp": "", "pdf_size": 1820355, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2425993830334019201&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Computer Vision Laboratory, Department of Electrical and Computer Engineering & ASRI, Seoul National University, Korea; Computer Vision Laboratory, Department of Electrical and Computer Engineering & ASRI, Seoul National University, Korea; Computer Vision Laboratory, Department of Electrical and Computer Engineering & ASRI, Seoul National University, Korea + Interdisciplinary Program of Arti\ufb01cial Intelligence, Seoul National University, Korea", "aff_domain": "snu.ac.kr; ; ", "email": "snu.ac.kr; ; ", "github": "", "project": "http://cvlab.snu.ac.kr/research/FedMLB", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kim22a.html", "aff_unique_index": "0;0;0+0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "Multi-Task Learning as a Bargaining Game", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17113", "id": "17113", "proceeding": "https://proceedings.mlr.press/v162/navon22a.html", "poster": "/media/PosterPDFs/ICML%202022/b44afe91b8a427a6be2078cc89bd6f9b.png?t=1657275682.8856382", "slides": "", "author_site": "Aviv Navon, Aviv Shamsian, Idan Achituve, Haggai Maron, Kenji Kawaguchi, Gal Chechik, Ethan Fetaya", "author": "Aviv Navon; Aviv Shamsian; Idan Achituve; Haggai Maron; 
Kenji Kawaguchi; Gal Chechik; Ethan Fetaya", "abstract": "In Multi-task learning (MTL), a joint model is trained to simultaneously make predictions for several tasks. Joint training reduces computation costs and improves data efficiency; however, since the gradients of these different tasks may conflict, training a joint model for MTL often yields lower performance than its corresponding single-task counterparts. A common method for alleviating this issue is to combine per-task gradients into a joint update direction using a particular heuristic. In this paper, we propose viewing the gradients combination step as a bargaining game, where tasks negotiate to reach an agreement on a joint direction of parameter update. Under certain assumptions, the bargaining problem has a unique solution, known as the", "bibtex": "@InProceedings{pmlr-v162-navon22a,\n title = \t {Multi-Task Learning as a Bargaining Game},\n author = {Navon, Aviv and Shamsian, Aviv and Achituve, Idan and Maron, Haggai and Kawaguchi, Kenji and Chechik, Gal and Fetaya, Ethan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16428--16446},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/navon22a/navon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/navon22a.html},\n abstract = \t {In Multi-task learning (MTL), a joint model is trained to simultaneously make predictions for several tasks. Joint training reduces computation costs and improves data efficiency; however, since the gradients of these different tasks may conflict, training a joint model for MTL often yields lower performance than its corresponding single-task counterparts. A common method for alleviating this issue is to combine per-task gradients into a joint update direction using a particular heuristic. In this paper, we propose viewing the gradients combination step as a bargaining game, where tasks negotiate to reach an agreement on a joint direction of parameter update. 
Under certain assumptions, the bargaining problem has a unique solution, known as the", "pdf": "https://proceedings.mlr.press/v162/navon22a/navon22a.pdf", "supp": "", "pdf_size": 7126787, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3841743488607196482&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Bar-Ilan University, Ramat Gan, Israel; Bar-Ilan University, Ramat Gan, Israel; Bar-Ilan University, Ramat Gan, Israel; Nvidia, Tel-Aviv, Israel; National University of Singapore; Bar-Ilan University, Ramat Gan, Israel + Nvidia, Tel-Aviv, Israel; Bar-Ilan University, Ramat Gan, Israel", "aff_domain": "biu.ac.il;live.biu.ac.il; ; ; ; ; ", "email": "biu.ac.il;live.biu.ac.il; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/navon22a.html", "aff_unique_index": "0;0;0;1;2;0+1;0", "aff_unique_norm": "Bar-Ilan University;NVIDIA;National University of Singapore", "aff_unique_dep": ";Nvidia;", "aff_unique_url": "https://www.biu.ac.il;https://www.nvidia.com;https://www.nus.edu.sg", "aff_unique_abbr": "BIU;NVDA;NUS", "aff_campus_unique_index": "0;0;0;1;0+1;0", "aff_campus_unique": "Ramat Gan;Tel-Aviv;", "aff_country_unique_index": "0;0;0;0;1;0+0;0", "aff_country_unique": "Israel;Singapore" }, { "title": "Multi-scale Feature Learning Dynamics: Insights for Double Descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18085", "id": "18085", "proceeding": "https://proceedings.mlr.press/v162/pezeshki22a.html", "poster": "/media/PosterPDFs/ICML%202022/44e76e99b5e194377e955b13fb12f630_4CRGv95.png?t=1657313683.5646849", "slides": "", "author_site": "Mohammad Pezeshki, Amartya Mitra, Yoshua Bengio, Guillaume Lajoie", "author": "Mohammad Pezeshki; Amartya Mitra; Yoshua Bengio; Guillaume Lajoie", "abstract": "An intriguing phenomenon that arises from the high-dimensional learning dynamics of neural networks is the phenomenon of \u201cdouble descent\u201d. The more commonly studied aspect of this phenomenon corresponds to", "bibtex": "@InProceedings{pmlr-v162-pezeshki22a,\n title = \t {Multi-scale Feature Learning Dynamics: Insights for Double Descent},\n author = {Pezeshki, Mohammad and Mitra, Amartya and Bengio, Yoshua and Lajoie, Guillaume},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17669--17690},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pezeshki22a/pezeshki22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pezeshki22a.html},\n abstract = \t {An intriguing phenomenon that arises from the high-dimensional learning dynamics of neural networks is the phenomenon of \u201cdouble descent\u201d. 
The more commonly studied aspect of this phenomenon corresponds to", "pdf": "https://proceedings.mlr.press/v162/pezeshki22a/pezeshki22a.pdf", "supp": "", "pdf_size": 2430000, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15892651020867127021&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/pezeshki22a.html" }, { "title": "Multi-slots Online Matching with High Entropy", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18163", "id": "18163", "proceeding": "https://proceedings.mlr.press/v162/lu22e.html", "poster": "/media/PosterPDFs/ICML%202022/6a81681a7af700c6385d36577ebec359.png?t=1656037211.9411004", "slides": "/media/icml-2022/Slides/18163_EsGc0rR.pdf", "author_site": "XINGYU LU, Qintong Wu, WENLIANG ZHONG", "author": "Xingyu Lu; Qintong Wu; Wenliang Zhong", "abstract": "Online matching with diversity and fairness pursuit, a common building block in the recommendation and advertising, can be modeled as constrained convex programming with high entropy. While most existing approaches are based on the \u201csingle slot\u201d assumption (i.e., assigning one item per iteration), they cannot be directly applied to cases with multiple slots, e.g., stock-aware top-N recommendation and advertising at multiple places. Particularly, the gradient computation and resource allocation are both challenging under this setting due to the absence of a closed-form solution. To overcome these obstacles, we develop a novel algorithm named Online subGradient descent for Multi-slots Allocation (OG-MA). It uses an efficient pooling algorithm to compute closed-form of the gradient then performs a roulette swapping for allocation, yielding a sub-linear regret with linear cost per iteration. Extensive experiments on synthetic and industrial data sets demonstrate that OG-MA is a fast and promising method for multi-slots online matching.", "bibtex": "@InProceedings{pmlr-v162-lu22e,\n title = \t {Multi-slots Online Matching with High Entropy},\n author = {Lu, Xingyu and Wu, Qintong and Zhong, Wenliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14412--14428},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lu22e/lu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/lu22e.html},\n abstract = \t {Online matching with diversity and fairness pursuit, a common building block in the recommendation and advertising, can be modeled as constrained convex programming with high entropy. While most existing approaches are based on the \u201csingle slot\u201d assumption (i.e., assigning one item per iteration), they cannot be directly applied to cases with multiple slots, e.g., stock-aware top-N recommendation and advertising at multiple places. Particularly, the gradient computation and resource allocation are both challenging under this setting due to the absence of a closed-form solution. To overcome these obstacles, we develop a novel algorithm named Online subGradient descent for Multi-slots Allocation (OG-MA). 
It uses an efficient pooling algorithm to compute closed-form of the gradient then performs a roulette swapping for allocation, yielding a sub-linear regret with linear cost per iteration. Extensive experiments on synthetic and industrial data sets demonstrate that OG-MA is a fast and promising method for multi-slots online matching.}\n}", "pdf": "https://proceedings.mlr.press/v162/lu22e/lu22e.pdf", "supp": "", "pdf_size": 460769, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9311234800733139039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Ant Group, Hangzhou, China; Ant Group, Hangzhou, China; Ant Group, Hangzhou, China", "aff_domain": "antgroup.com;antgroup.com;antgroup.com", "email": "antgroup.com;antgroup.com;antgroup.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lu22e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Ant Group", "aff_unique_dep": "", "aff_unique_url": "https://www.antgroup.com", "aff_unique_abbr": "Ant Group", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hangzhou", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Multiclass learning with margin: exponential rates with no bias-variance trade-off", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18111", "id": "18111", "proceeding": "https://proceedings.mlr.press/v162/vigogna22a.html", "poster": "/media/PosterPDFs/ICML%202022/38ccdf8d538de2d6a6deb2ed17d1f873.png?t=1657920588.0570412", "slides": "", "author_site": "Stefano Vigogna, Giacomo Meanti, Ernesto De Vito, Lorenzo Rosasco", "author": "Stefano Vigogna; Giacomo Meanti; Ernesto De Vito; Lorenzo Rosasco", "abstract": "We study the behavior of error bounds for multiclass classification under suitable margin conditions. For a wide variety of methods we prove that the classification error under a hard-margin condition decreases exponentially fast without any bias-variance trade-off. Different convergence rates can be obtained in correspondence of different margin assumptions. With a self-contained and instructive analysis we are able to generalize known results from the binary to the multiclass setting.", "bibtex": "@InProceedings{pmlr-v162-vigogna22a,\n title = \t {Multiclass learning with margin: exponential rates with no bias-variance trade-off},\n author = {Vigogna, Stefano and Meanti, Giacomo and De Vito, Ernesto and Rosasco, Lorenzo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22260--22269},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vigogna22a/vigogna22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vigogna22a.html},\n abstract = \t {We study the behavior of error bounds for multiclass classification under suitable margin conditions. For a wide variety of methods we prove that the classification error under a hard-margin condition decreases exponentially fast without any bias-variance trade-off. Different convergence rates can be obtained in correspondence of different margin assumptions. 
With a self-contained and instructive analysis we are able to generalize known results from the binary to the multiclass setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/vigogna22a/vigogna22a.pdf", "supp": "", "pdf_size": 1653037, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16147491111916664188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "RoMaDS, University of Rome Tor Vergata, Rome, Italy; MaLGa - DIBRIS, University of Genova, Italy; MaLGa - DIMA, University of Genova, Italy; Istituto Italiano di Tecnologia, Genova, Italy + CBMM - MIT, Cambridge, MA, USA", "aff_domain": "mat.uniroma2.it; ; ; ", "email": "mat.uniroma2.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/vigogna22a.html", "aff_unique_index": "0;1;1;2+3", "aff_unique_norm": "University of Rome Tor Vergata;University of Genova;Istituto Italiano di Tecnologia;Massachusetts Institute of Technology", "aff_unique_dep": "RoMaDS;DIBRIS;;CBMM", "aff_unique_url": "https://www.uniroma2.it;https://www.unige.it;https://www.iit.it;https://www.mit.edu", "aff_unique_abbr": "UniRoma2;UniGe;IIT;MIT", "aff_campus_unique_index": "0;2+3", "aff_campus_unique": "Rome;;Genova;Cambridge", "aff_country_unique_index": "0;0;0;0+1", "aff_country_unique": "Italy;United States" }, { "title": "Multicoated Supermasks Enhance Hidden Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18245", "id": "18245", "proceeding": "https://proceedings.mlr.press/v162/okoshi22a.html", "poster": "/media/PosterPDFs/ICML%202022/62161512d8b1b5db826778917e974b21.png?t=1657674721.619701", "slides": "", "author_site": "Yasuyuki Okoshi, \u00c1ngel L\u00f3pez Garc\u00eda-Arias, Kazutoshi Hirose, Kota Ando, Kazushi Kawamura, Thiem Van Chu, Masato Motomura, Jaehoon Yu", "author": "Yasuyuki Okoshi; \u00c1ngel L\u00f3pez Garc\u0131\u0301a-Arias; Kazutoshi Hirose; Kota Ando; Kazushi Kawamura; Thiem Van Chu; Masato Motomura; Jaehoon Yu", "abstract": "Hidden Networks (Ramanujan et al., 2020) showed the possibility of finding accurate subnetworks within a randomly weighted neural network by training a connectivity mask, referred to as supermask. We show that the supermask stops improving even though gradients are not zero, thus underutilizing backpropagated information. To address this we propose a method that extends Hidden Networks by training an overlay of multiple hierarchical supermasks{\u2014}a multicoated supermask. This method shows that using multiple supermasks for a single task achieves higher accuracy without additional training cost. Experiments on CIFAR-10 and ImageNet show that Multicoated Supermasks enhance the tradeoff between accuracy and model size. 
A ResNet-101 using a 7-coated supermask outperforms its Hidden Networks counterpart by 4%, matching the accuracy of a dense ResNet-50 while being an order of magnitude smaller.", "bibtex": "@InProceedings{pmlr-v162-okoshi22a,\n title = \t {Multicoated Supermasks Enhance Hidden Networks},\n author = {Okoshi, Yasuyuki and Garc\\'{\\i}a-Arias, \\'Angel L{\\'o}pez and Hirose, Kazutoshi and Ando, Kota and Kawamura, Kazushi and Van Chu, Thiem and Motomura, Masato and Yu, Jaehoon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17045--17055},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/okoshi22a/okoshi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/okoshi22a.html},\n abstract = \t {Hidden Networks (Ramanujan et al., 2020) showed the possibility of finding accurate subnetworks within a randomly weighted neural network by training a connectivity mask, referred to as supermask. We show that the supermask stops improving even though gradients are not zero, thus underutilizing backpropagated information. To address this we propose a method that extends Hidden Networks by training an overlay of multiple hierarchical supermasks{\u2014}a multicoated supermask. This method shows that using multiple supermasks for a single task achieves higher accuracy without additional training cost. Experiments on CIFAR-10 and ImageNet show that Multicoated Supermasks enhance the tradeoff between accuracy and model size. A ResNet-101 using a 7-coated supermask outperforms its Hidden Networks counterpart by 4%, matching the accuracy of a dense ResNet-50 while being an order of magnitude smaller.}\n}", "pdf": "https://proceedings.mlr.press/v162/okoshi22a/okoshi22a.pdf", "supp": "", "pdf_size": 7564189, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3887684723864190786&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan; Tokyo Institute of Technology, Japan", "aff_domain": "artic.iir.titech.ac.jp;artic.iir.titech.ac.jp; ; ; ; ; ;artic.iir.titech.ac.jp", "email": "artic.iir.titech.ac.jp;artic.iir.titech.ac.jp; ; ; ; ; ;artic.iir.titech.ac.jp", "github": "https://github.com/yasu0001/multicoated-supermasks", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/okoshi22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Tokyo Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.titech.ac.jp", "aff_unique_abbr": "Titech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Multiple-Play Stochastic Bandits with Shareable Finite-Capacity Arms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17289", "id": "17289", "proceeding": "https://proceedings.mlr.press/v162/wang22af.html", "poster": "/media/PosterPDFs/ICML%202022/1e747ddbea997a1b933aaf58a7953c3c.png?t=1656687190.6839826", "slides": 
"/media/icml-2022/Slides/17289.pdf", "author_site": "Xuchuang Wang, Hong Xie, John C. S. Lui", "author": "Xuchuang Wang; Hong Xie; John C. S. Lui", "abstract": "We generalize the multiple-play multi-armed bandits (MP-MAB) problem with a shareable arms setting, in which several plays can share the same arm. Furthermore, each shareable arm has a finite reward capacity and a \u201cper-load\u201d reward distribution, both of which are unknown to the learner. The reward from a shareable arm is load-dependent, which is the \u201cper-load\u201d reward multiplying either the number of plays pulling the arm, or its reward capacity when the number of plays exceeds the capacity limit. When the \u201cper-load\u201d reward follows a Gaussian distribution, we prove a sample complexity lower bound of learning the capacity from load-dependent rewards and also a regret lower bound of this new MP-MAB problem. We devise a capacity estimator whose sample complexity upper bound matches the lower bound in terms of reward means and capacities. We also propose an online learning algorithm to address the problem and prove its regret upper bound. This regret upper bound\u2019s first term is the same as regret lower bound\u2019s, and its second and third terms also evidently correspond to lower bound\u2019s. Extensive experiments validate our algorithm\u2019s performance and also its gain in 5G & 4G base station selection.", "bibtex": "@InProceedings{pmlr-v162-wang22af,\n title = \t {Multiple-Play Stochastic Bandits with Shareable Finite-Capacity Arms},\n author = {Wang, Xuchuang and Xie, Hong and Lui, John C. S.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23181--23212},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22af/wang22af.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22af.html},\n abstract = \t {We generalize the multiple-play multi-armed bandits (MP-MAB) problem with a shareable arms setting, in which several plays can share the same arm. Furthermore, each shareable arm has a finite reward capacity and a \u201cper-load\u201d reward distribution, both of which are unknown to the learner. The reward from a shareable arm is load-dependent, which is the \u201cper-load\u201d reward multiplying either the number of plays pulling the arm, or its reward capacity when the number of plays exceeds the capacity limit. When the \u201cper-load\u201d reward follows a Gaussian distribution, we prove a sample complexity lower bound of learning the capacity from load-dependent rewards and also a regret lower bound of this new MP-MAB problem. We devise a capacity estimator whose sample complexity upper bound matches the lower bound in terms of reward means and capacities. We also propose an online learning algorithm to address the problem and prove its regret upper bound. This regret upper bound\u2019s first term is the same as regret lower bound\u2019s, and its second and third terms also evidently correspond to lower bound\u2019s. 
Extensive experiments validate our algorithm\u2019s performance and also its gain in 5G & 4G base station selection.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22af/wang22af.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22af-supp.zip", "pdf_size": 1198434, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12853471905315772628&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science & Engineering, The Chinese University of Hong Kong; College of Computer Science, Chongqing University, China; Department of Computer Science & Engineering, The Chinese University of Hong Kong", "aff_domain": "cse.cuhk.edu.hk;foxmail.com;cse.cuhk.edu.hk", "email": "cse.cuhk.edu.hk;foxmail.com;cse.cuhk.edu.hk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22af.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Chinese University of Hong Kong;Chongqing University", "aff_unique_dep": "Department of Computer Science & Engineering;College of Computer Science", "aff_unique_url": "https://www.cuhk.edu.hk;http://www.cqu.edu.cn", "aff_unique_abbr": "CUHK;CQU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Multirate Training of Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16705", "id": "16705", "proceeding": "https://proceedings.mlr.press/v162/vlaar22b.html", "poster": "/media/PosterPDFs/ICML%202022/63c4b1baf3b4460fa9936b1a20919bec.png?t=1657545177.4738724", "slides": "/media/icml-2022/Slides/16705.pdf", "author_site": "Tiffany Vlaar, Benedict Leimkuhler", "author": "Tiffany J Vlaar; Benedict Leimkuhler", "abstract": "We propose multirate training of neural networks: partitioning neural network parameters into \"fast\" and \"slow\" parts which are trained on different time scales, where slow parts are updated less frequently. By choosing appropriate partitionings we can obtain substantial computational speed-up for transfer learning tasks. We show for applications in vision and NLP that we can fine-tune deep neural networks in almost half the time, without reducing the generalization performance of the resulting models. We analyze the convergence properties of our multirate scheme and draw a comparison with vanilla SGD. We also discuss splitting choices for the neural network parameters which could enhance generalization performance when neural networks are trained from scratch. A multirate approach can be used to learn different features present in the data and as a form of regularization. 
Our paper unlocks the potential of using multirate techniques for neural network training and provides several starting points for future work in this area.", "bibtex": "@InProceedings{pmlr-v162-vlaar22b,\n title = \t {Multirate Training of Neural Networks},\n author = {Vlaar, Tiffany J and Leimkuhler, Benedict},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22342--22360},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vlaar22b/vlaar22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/vlaar22b.html},\n abstract = \t {We propose multirate training of neural networks: partitioning neural network parameters into \"fast\" and \"slow\" parts which are trained on different time scales, where slow parts are updated less frequently. By choosing appropriate partitionings we can obtain substantial computational speed-up for transfer learning tasks. We show for applications in vision and NLP that we can fine-tune deep neural networks in almost half the time, without reducing the generalization performance of the resulting models. We analyze the convergence properties of our multirate scheme and draw a comparison with vanilla SGD. We also discuss splitting choices for the neural network parameters which could enhance generalization performance when neural networks are trained from scratch. A multirate approach can be used to learn different features present in the data and as a form of regularization. Our paper unlocks the potential of using multirate techniques for neural network training and provides several starting points for future work in this area.}\n}", "pdf": "https://proceedings.mlr.press/v162/vlaar22b/vlaar22b.pdf", "supp": "", "pdf_size": 1661969, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14672109036130949413&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Mathematics, University of Edinburgh, Edinburgh, United Kingdom; Department of Mathematics, University of Edinburgh, Edinburgh, United Kingdom", "aff_domain": "ed.ac.uk; ", "email": "ed.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/vlaar22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "Department of Mathematics", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "N-Penetrate: Active Learning of Neural Collision Handler for Complex 3D Mesh Deformations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16817", "id": "16817", "proceeding": "https://proceedings.mlr.press/v162/tan22b.html", "poster": "/media/PosterPDFs/ICML%202022/900c563bfd2c48c16701acca83ad858a.png?t=1658170836.2133512", "slides": "", "author_site": "Qingyang Tan, Zherong Pan, Breannan Smith, Takaaki Shiratori, Dinesh Manocha", "author": "Qingyang Tan; Zherong Pan; Breannan Smith; Takaaki Shiratori; Dinesh Manocha", "abstract": "We present a robust learning algorithm to detect and handle collisions in 3D deforming meshes. 
We first train a neural network to detect collisions and then use a numerical optimization algorithm to resolve penetrations guided by the network. Our learned collision handler can resolve collisions for unseen, high-dimensional meshes with thousands of vertices. To obtain stable network performance in such large and unseen spaces, we apply active learning by progressively inserting new collision data based on the network inferences. We automatically label these new data using an analytical collision detector and progressively fine-tune our detection networks. We evaluate our method for collision handling of complex, 3D meshes coming from several datasets with different shapes and topologies, including datasets corresponding to dressed and undressed human poses, cloth simulations, and human hand poses acquired using multi-view capture systems.", "bibtex": "@InProceedings{pmlr-v162-tan22b,\n title = \t {N-Penetrate: Active Learning of Neural Collision Handler for Complex 3{D} Mesh Deformations},\n author = {Tan, Qingyang and Pan, Zherong and Smith, Breannan and Shiratori, Takaaki and Manocha, Dinesh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21037--21049},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tan22b/tan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/tan22b.html},\n abstract = \t {We present a robust learning algorithm to detect and handle collisions in 3D deforming meshes. We first train a neural network to detect collisions and then use a numerical optimization algorithm to resolve penetrations guided by the network. Our learned collision handler can resolve collisions for unseen, high-dimensional meshes with thousands of vertices. To obtain stable network performance in such large and unseen spaces, we apply active learning by progressively inserting new collision data based on the network inferences. We automatically label these new data using an analytical collision detector and progressively fine-tune our detection networks. 
We evaluate our method for collision handling of complex, 3D meshes coming from several datasets with different shapes and topologies, including datasets corresponding to dressed and undressed human poses, cloth simulations, and human hand poses acquired using multi-view capture systems.}\n}", "pdf": "https://proceedings.mlr.press/v162/tan22b/tan22b.pdf", "supp": "", "pdf_size": 8789944, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3575124446187059497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, University of Maryland at College Park; Lightspeed & Quantum Studio, Tencent America; Meta Reality Labs Research; Meta Reality Labs Research; Department of Computer Science, University of Maryland at College Park", "aff_domain": "umd.edu; ; ; ; ", "email": "umd.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/tan22b.html", "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "University of Maryland;Tencent;Meta", "aff_unique_dep": "Department of Computer Science;Lightspeed & Quantum Studio;Research", "aff_unique_url": "https://www/umd.edu;https://www.tencent.com/en-us;https://www.meta.com", "aff_unique_abbr": "UMD;Tencent;MRL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "NAFS: A Simple yet Tough-to-beat Baseline for Graph Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17247", "id": "17247", "proceeding": "https://proceedings.mlr.press/v162/zhang22y.html", "poster": "", "slides": "", "author_site": "Wentao Zhang, Zeang Sheng, Mingyu Yang, Yang Li, Yu Shen, Zhi Yang, Bin Cui", "author": "Wentao Zhang; Zeang Sheng; Mingyu Yang; Yang Li; Yu Shen; Zhi Yang; Bin Cui", "abstract": "Recently, graph neural networks (GNNs) have shown prominent performance in graph representation learning by leveraging knowledge from both graph structure and node features. However, most of them have two major limitations. First, GNNs can learn higher-order structural information by stacking more layers but can not deal with large depth due to the over-smoothing issue. Second, it is not easy to apply these methods on large graphs due to the expensive computation cost and high memory usage. In this paper, we present node-adaptive feature smoothing (NAFS), a simple non-parametric method that constructs node representations without parameter learning. NAFS first extracts the features of each node with its neighbors of different hops by feature smoothing, and then adaptively combines the smoothed features. Besides, the constructed node representation can further be enhanced by the ensemble of smoothed features extracted via different smoothing strategies. We conduct experiments on four benchmark datasets on two different application scenarios: node clustering and link prediction. 
Remarkably, NAFS with feature ensemble outperforms the state-of-the-art GNNs on these tasks and mitigates the aforementioned two limitations of most learning-based GNN counterparts.", "bibtex": "@InProceedings{pmlr-v162-zhang22y,\n title = \t {{NAFS}: A Simple yet Tough-to-beat Baseline for Graph Representation Learning},\n author = {Zhang, Wentao and Sheng, Zeang and Yang, Mingyu and Li, Yang and Shen, Yu and Yang, Zhi and Cui, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26467--26483},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22y/zhang22y.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22y.html},\n abstract = \t {Recently, graph neural networks (GNNs) have shown prominent performance in graph representation learning by leveraging knowledge from both graph structure and node features. However, most of them have two major limitations. First, GNNs can learn higher-order structural information by stacking more layers but can not deal with large depth due to the over-smoothing issue. Second, it is not easy to apply these methods on large graphs due to the expensive computation cost and high memory usage. In this paper, we present node-adaptive feature smoothing (NAFS), a simple non-parametric method that constructs node representations without parameter learning. NAFS first extracts the features of each node with its neighbors of different hops by feature smoothing, and then adaptively combines the smoothed features. Besides, the constructed node representation can further be enhanced by the ensemble of smoothed features extracted via different smoothing strategies. We conduct experiments on four benchmark datasets on two different application scenarios: node clustering and link prediction. 
Remarkably, NAFS with feature ensemble outperforms the state-of-the-art GNNs on these tasks and mitigates the aforementioned two limitations of most learning-based GNN counterparts.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22y/zhang22y.pdf", "supp": "", "pdf_size": 1853690, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15414509502576388233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University; School of CS & Key Laboratory of High Con\ufb01dence Software Technologies, Peking University + Institute of Computational Social Science, Peking University (Qingdao), China", "aff_domain": "pku.edu.cn; ; ; ; ; ;pku.edu.cn", "email": "pku.edu.cn; ; ; ; ; ;pku.edu.cn", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhang22y.html", "aff_unique_index": "0;0;0;0;0;0;0+0", "aff_unique_norm": "Peking University", "aff_unique_dep": "School of CS & Key Laboratory of High Con\ufb01dence Software Technologies", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Qingdao", "aff_country_unique_index": "0;0;0;0;0;0;0+0", "aff_country_unique": "China" }, { "title": "NISPA: Neuro-Inspired Stability-Plasticity Adaptation for Continual Learning in Sparse Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16095", "id": "16095", "proceeding": "https://proceedings.mlr.press/v162/gurbuz22a.html", "poster": "/media/PosterPDFs/ICML%202022/16fc18d787294ad5171100e33d05d4e2.png?t=1657173386.8563817", "slides": "", "author_site": "Mustafa Burak Gurbuz, Constantine Dovrolis", "author": "Mustafa B Gurbuz; Constantine Dovrolis", "abstract": "The goal of continual learning (CL) is to learn different tasks over time. The main desiderata associated with CL are to maintain performance on older tasks, leverage the latter to improve learning of future tasks, and to introduce minimal overhead in the training process (for instance, to not require a growing model or retraining). We propose the Neuro-Inspired Stability-Plasticity Adaptation (NISPA) architecture that addresses these desiderata through a sparse neural network with fixed density. NISPA forms stable paths to preserve learned knowledge from older tasks. Also, NISPA uses connection rewiring to create new plastic paths that reuse existing knowledge on novel tasks. Our extensive evaluation on EMNIST, FashionMNIST, CIFAR10, and CIFAR100 datasets shows that NISPA significantly outperforms representative state-of-the-art continual learning baselines, and it uses up to ten times fewer learnable parameters compared to baselines. We also make the case that sparsity is an essential ingredient for continual learning. 
The NISPA code is available at https://github.com/BurakGurbuz97/NISPA.", "bibtex": "@InProceedings{pmlr-v162-gurbuz22a,\n title = \t {{NISPA}: Neuro-Inspired Stability-Plasticity Adaptation for Continual Learning in Sparse Networks},\n author = {Gurbuz, Mustafa B and Dovrolis, Constantine},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8157--8174},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gurbuz22a/gurbuz22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gurbuz22a.html},\n abstract = \t {The goal of continual learning (CL) is to learn different tasks over time. The main desiderata associated with CL are to maintain performance on older tasks, leverage the latter to improve learning of future tasks, and to introduce minimal overhead in the training process (for instance, to not require a growing model or retraining). We propose the Neuro-Inspired Stability-Plasticity Adaptation (NISPA) architecture that addresses these desiderata through a sparse neural network with fixed density. NISPA forms stable paths to preserve learned knowledge from older tasks. Also, NISPA uses connection rewiring to create new plastic paths that reuse existing knowledge on novel tasks. Our extensive evaluation on EMNIST, FashionMNIST, CIFAR10, and CIFAR100 datasets shows that NISPA significantly outperforms representative state-of-the-art continual learning baselines, and it uses up to ten times fewer learnable parameters compared to baselines. We also make the case that sparsity is an essential ingredient for continual learning. 
The NISPA code is available at https://github.com/BurakGurbuz97/NISPA.}\n}", "pdf": "https://proceedings.mlr.press/v162/gurbuz22a/gurbuz22a.pdf", "supp": "", "pdf_size": 17272994, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17073314745146797398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science, Georgia Institute of Technology, USA + KIOS Research and Innovation Center of Excellence, Cyprus; School of Computer Science, Georgia Institute of Technology, USA + KIOS Research and Innovation Center of Excellence, Cyprus", "aff_domain": "gatech.edu;gatech.edu", "email": "gatech.edu;gatech.edu", "github": "https://github.com/BurakGurbuz97/NISPA", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/gurbuz22a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Georgia Institute of Technology;KIOS Research and Innovation Center of Excellence", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.gatech.edu;", "aff_unique_abbr": "Georgia Tech;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Georgia Tech;", "aff_country_unique_index": "0+1;0+1", "aff_country_unique": "United States;Cyprus" }, { "title": "NLP From Scratch Without Large-Scale Pretraining: A Simple and Efficient Framework", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16657", "id": "16657", "proceeding": "https://proceedings.mlr.press/v162/yao22c.html", "poster": "/media/PosterPDFs/ICML%202022/05311655a15b75fab86956663e1819cd_0njXElx.png?t=1656673112.622342", "slides": "", "author_site": "Xingcheng Yao, Yanan Zheng, Xiaocong Yang, Zhilin Yang", "author": "Xingcheng Yao; Yanan Zheng; Xiaocong Yang; Zhilin Yang", "abstract": "Pretrained language models have become the standard approach for many NLP tasks due to strong performance, but they are very expensive to train. We propose a simple and efficient learning framework, TLM, that does not rely on large-scale pretraining. Given some labeled task data and a large general corpus, TLM uses task data as queries to retrieve a tiny subset of the general corpus and jointly optimizes the task objective and the language modeling objective from scratch. On eight classification datasets in four domains, TLM achieves results better than or similar to pretrained language models (e.g., RoBERTa-Large) while reducing the training FLOPs by two orders of magnitude. With high accuracy and efficiency, we hope TLM will contribute to democratizing NLP and expediting its development.", "bibtex": "@InProceedings{pmlr-v162-yao22c,\n title = \t {{NLP} From Scratch Without Large-Scale Pretraining: A Simple and Efficient Framework},\n author = {Yao, Xingcheng and Zheng, Yanan and Yang, Xiaocong and Yang, Zhilin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25438--25451},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yao22c/yao22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/yao22c.html},\n abstract = \t {Pretrained language models have become the standard approach for many NLP tasks due to strong performance, but they are very expensive to train. 
We propose a simple and efficient learning framework, TLM, that does not rely on large-scale pretraining. Given some labeled task data and a large general corpus, TLM uses task data as queries to retrieve a tiny subset of the general corpus and jointly optimizes the task objective and the language modeling objective from scratch. On eight classification datasets in four domains, TLM achieves results better than or similar to pretrained language models (e.g., RoBERTa-Large) while reducing the training FLOPs by two orders of magnitude. With high accuracy and efficiency, we hope TLM will contribute to democratizing NLP and expediting its development.}\n}", "pdf": "https://proceedings.mlr.press/v162/yao22c/yao22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yao22c-supp.zip", "pdf_size": 2137076, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3254978626719045112&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University; Department of Computer Science and Technology, Tsinghua University; School of Economics and Management, Tsinghua University + Recurrent AI, Inc; Institute for Interdisciplinary Information Sciences, Tsinghua University + Shanghai Qi Zhi Institute", "aff_domain": "tsinghua.edu.cn; ; ; ", "email": "tsinghua.edu.cn; ; ; ", "github": "https://github.com/yaoxingcheng/TLM", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yao22c.html", "aff_unique_index": "0;0;0+1;0+2", "aff_unique_norm": "Tsinghua University;Recurrent AI;Shanghai Qi Zhi Institute", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://www.qz.io", "aff_unique_abbr": "Tsinghua;RAI;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0+0", "aff_country_unique": "China;United States" }, { "title": "NOMU: Neural Optimization-based Model Uncertainty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16225", "id": "16225", "proceeding": "https://proceedings.mlr.press/v162/heiss22a.html", "poster": "/media/PosterPDFs/ICML%202022/71f6278d140af599e06ad9bf1ba03cb0.png?t=1657185977.474555", "slides": "", "author_site": "Jakob Heiss, Jakob Weissteiner, Hanna Wutte, Sven Seuken, Josef Teichmann", "author": "Jakob M Heiss; Jakob Weissteiner; Hanna S Wutte; Sven Seuken; Josef Teichmann", "abstract": "We study methods for estimating model uncertainty for neural networks (NNs) in regression. To isolate the effect of model uncertainty, we focus on a noiseless setting with scarce training data. We introduce five important desiderata regarding model uncertainty that any method should satisfy. However, we find that established benchmarks often fail to reliably capture some of these desiderata, even those that are required by Bayesian theory. To address this, we introduce a new approach for capturing model uncertainty for NNs, which we call Neural Optimization-based Model Uncertainty (NOMU). The main idea of NOMU is to design a network architecture consisting of two connected sub-NNs, one for model prediction and one for model uncertainty, and to train it using a carefully-designed loss function. Importantly, our design enforces that NOMU satisfies our five desiderata. Due to its modular architecture, NOMU can provide model uncertainty for any given (previously trained) NN if given access to its training data. 
We evaluate NOMU in various regression tasks and noiseless Bayesian optimization (BO) with costly evaluations. In regression, NOMU performs at least as well as state-of-the-art methods. In BO, NOMU even outperforms all considered benchmarks.", "bibtex": "@InProceedings{pmlr-v162-heiss22a,\n title = \t {{NOMU}: Neural Optimization-based Model Uncertainty},\n author = {Heiss, Jakob M and Weissteiner, Jakob and Wutte, Hanna S and Seuken, Sven and Teichmann, Josef},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8708--8758},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/heiss22a/heiss22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/heiss22a.html},\n abstract = \t {We study methods for estimating model uncertainty for neural networks (NNs) in regression. To isolate the effect of model uncertainty, we focus on a noiseless setting with scarce training data. We introduce five important desiderata regarding model uncertainty that any method should satisfy. However, we find that established benchmarks often fail to reliably capture some of these desiderata, even those that are required by Bayesian theory. To address this, we introduce a new approach for capturing model uncertainty for NNs, which we call Neural Optimization-based Model Uncertainty (NOMU). The main idea of NOMU is to design a network architecture consisting of two connected sub-NNs, one for model prediction and one for model uncertainty, and to train it using a carefully-designed loss function. Importantly, our design enforces that NOMU satisfies our five desiderata. Due to its modular architecture, NOMU can provide model uncertainty for any given (previously trained) NN if given access to its training data. We evaluate NOMU in various regression tasks and noiseless Bayesian optimization (BO) with costly evaluations. In regression, NOMU performs at least as well as state-of-the-art methods. 
In BO, NOMU even outperforms all considered benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/heiss22a/heiss22a.pdf", "supp": "", "pdf_size": 10591197, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17483969048738577269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "ETH Zurich + ETH AI Center; ETH AI Center + University of Zurich; ETH Zurich + ETH AI Center; ETH AI Center + University of Zurich; ETH Zurich + ETH AI Center", "aff_domain": "ifi.uzh.ch; ; ; ; ", "email": "ifi.uzh.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/heiss22a.html", "aff_unique_index": "0+0;0+1;0+0;0+1;0+0", "aff_unique_norm": "ETH Zurich;University of Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.unizh.ch", "aff_unique_abbr": "ETHZ;UZH", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "Switzerland" }, { "title": "NP-Match: When Neural Processes meet Semi-Supervised Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18179", "id": "18179", "proceeding": "https://proceedings.mlr.press/v162/wang22s.html", "poster": "/media/PosterPDFs/ICML%202022/069d3bb002acd8d7dd095917f9efe4cb.png?t=1657374224.923614", "slides": "", "author_site": "Jianfeng Wang, Thomas Lukasiewicz, Daniela Massiceti, Xiaolin Hu, Vladimir Pavlovic, Alexandros Neophytou", "author": "Jianfeng Wang; Thomas Lukasiewicz; Daniela Massiceti; Xiaolin Hu; Vladimir Pavlovic; Alexandros Neophytou", "abstract": "Semi-supervised learning (SSL) has been widely explored in recent years, and it is an effective way of leveraging unlabeled data to reduce the reliance on labeled data. In this work, we adjust neural processes (NPs) to the semi-supervised image classification task, resulting in a new method named NP-Match. NP-Match is suited to this task for two reasons. Firstly, NP-Match implicitly compares data points when making predictions, and as a result, the prediction of each unlabeled data point is affected by the labeled data points that are similar to it, which improves the quality of pseudolabels. Secondly, NP-Match is able to estimate uncertainty that can be used as a tool for selecting unlabeled samples with reliable pseudo-labels. Compared with uncertainty-based SSL methods implemented with Monte Carlo (MC) dropout, NP-Match estimates uncertainty with much less computational overhead, which can save time at both the training and the testing phases. 
We conducted extensive experiments on four public datasets, and NP-Match outperforms state-of-the-art (SOTA) results or achieves competitive results on them, which shows the effectiveness of NP-Match and its potential for SSL.", "bibtex": "@InProceedings{pmlr-v162-wang22s,\n title = \t {{NP}-Match: When Neural Processes meet Semi-Supervised Learning},\n author = {Wang, Jianfeng and Lukasiewicz, Thomas and Massiceti, Daniela and Hu, Xiaolin and Pavlovic, Vladimir and Neophytou, Alexandros},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22919--22934},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22s/wang22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22s.html},\n abstract = \t {Semi-supervised learning (SSL) has been widely explored in recent years, and it is an effective way of leveraging unlabeled data to reduce the reliance on labeled data. In this work, we adjust neural processes (NPs) to the semi-supervised image classification task, resulting in a new method named NP-Match. NP-Match is suited to this task for two reasons. Firstly, NP-Match implicitly compares data points when making predictions, and as a result, the prediction of each unlabeled data point is affected by the labeled data points that are similar to it, which improves the quality of pseudolabels. Secondly, NP-Match is able to estimate uncertainty that can be used as a tool for selecting unlabeled samples with reliable pseudo-labels. Compared with uncertainty-based SSL methods implemented with Monte Carlo (MC) dropout, NP-Match estimates uncertainty with much less computational overhead, which can save time at both the training and the testing phases. 
We conducted extensive experiments on four public datasets, and NP-Match outperforms state-of-the-art (SOTA) results or achieves competitive results on them, which shows the effectiveness of NP-Match and its potential for SSL.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22s/wang22s.pdf", "supp": "", "pdf_size": 694263, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13863868059773263765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Oxford, UK; Institute of Logic and Computation, TU Wien, Austria; Microsoft Research, Cambridge, UK; Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science, Rutgers University, New Jersey, USA; Microsoft, Applied Science Group, Reading, UK", "aff_domain": "cs.ox.ac.uk; ; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22s.html", "aff_unique_index": "0;1;2;3;4;2", "aff_unique_norm": "University of Oxford;TU Wien;Microsoft;Tsinghua University;Rutgers University", "aff_unique_dep": "Department of Computer Science;Institute of Logic and Computation;Microsoft Research;Department of Computer Science and Technology;Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk;https://www.tuwien.ac.at;https://www.microsoft.com/en-us/research;https://www.tsinghua.edu.cn;https://www.rutgers.edu", "aff_unique_abbr": "Oxford;TU Wien;MSR;THU;Rutgers", "aff_campus_unique_index": "1;2;3;4", "aff_campus_unique": ";Cambridge;Beijing;New Brunswick;Reading", "aff_country_unique_index": "0;1;0;2;3;0", "aff_country_unique": "United Kingdom;Austria;China;United States" }, { "title": "Near-Exact Recovery for Tomographic Inverse Problems via Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16155", "id": "16155", "proceeding": "https://proceedings.mlr.press/v162/genzel22a.html", "poster": "/media/PosterPDFs/ICML%202022/f316e3fe33f1f754851712c760ab9d48.png?t=1657618831.664118", "slides": "", "author_site": "Martin Genzel, Ingo G\u00fchring, Jan Macdonald, Maximilian M\u00e4rz", "author": "Martin Genzel; Ingo G\u00fchring; Jan Macdonald; Maximilian M\u00e4rz", "abstract": "This work is concerned with the following fundamental question in scientific machine learning: Can deep-learning-based methods solve noise-free inverse problems to near-perfect accuracy? Positive evidence is provided for the first time, focusing on a prototypical computed tomography (CT) setup. We demonstrate that an iterative end-to-end network scheme enables reconstructions close to numerical precision, comparable to classical compressed sensing strategies. Our results build on our winning submission to the recent AAPM DL-Sparse-View CT Challenge. Its goal was to identify the state-of-the-art in solving the sparse-view CT inverse problem with data-driven techniques. A specific difficulty of the challenge setup was that the precise forward model remained unknown to the participants. Therefore, a key feature of our approach was to initially estimate the unknown fanbeam geometry in a data-driven calibration step. 
Apart from an in-depth analysis of our methodology, we also demonstrate its state-of-the-art performance on the open-access real-world dataset LoDoPaB CT.", "bibtex": "@InProceedings{pmlr-v162-genzel22a,\n title = \t {Near-Exact Recovery for Tomographic Inverse Problems via Deep Learning},\n author = {Genzel, Martin and G{\\\"u}hring, Ingo and Macdonald, Jan and M{\\\"a}rz, Maximilian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7368--7381},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/genzel22a/genzel22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/genzel22a.html},\n abstract = \t {This work is concerned with the following fundamental question in scientific machine learning: Can deep-learning-based methods solve noise-free inverse problems to near-perfect accuracy? Positive evidence is provided for the first time, focusing on a prototypical computed tomography (CT) setup. We demonstrate that an iterative end-to-end network scheme enables reconstructions close to numerical precision, comparable to classical compressed sensing strategies. Our results build on our winning submission to the recent AAPM DL-Sparse-View CT Challenge. Its goal was to identify the state-of-the-art in solving the sparse-view CT inverse problem with data-driven techniques. A specific difficulty of the challenge setup was that the precise forward model remained unknown to the participants. Therefore, a key feature of our approach was to initially estimate the unknown fanbeam geometry in a data-driven calibration step. 
Apart from an in-depth analysis of our methodology, we also demonstrate its state-of-the-art performance on the open-access real-world dataset LoDoPaB CT.}\n}", "pdf": "https://proceedings.mlr.press/v162/genzel22a/genzel22a.pdf", "supp": "", "pdf_size": 2627366, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10012619344494620426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Helmholtz-Zentrum Berlin f \u00fcr Materialien und Energie, Germany + Utrecht University, Netherlands; Technical University Berlin, Germany; Technical University Berlin, Germany; Technical University Berlin, Germany", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/genzel22a.html", "aff_unique_index": "0+1;2;2;2", "aff_unique_norm": "Helmholtz-Zentrum Berlin f\u00fcr Materialien und Energie;Utrecht University;Technical University Berlin", "aff_unique_dep": ";;", "aff_unique_url": "https://www.helmholtz-berlin.de;https://www.uu.nl;https://www.tu-berlin.de", "aff_unique_abbr": "HZB;UU;TUB", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;0", "aff_country_unique": "Germany;Netherlands" }, { "title": "Near-Optimal Algorithms for Autonomous Exploration and Multi-Goal Stochastic Shortest Path", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18095", "id": "18095", "proceeding": "https://proceedings.mlr.press/v162/cai22a.html", "poster": "/media/PosterPDFs/ICML%202022/cfe795a0a3c7bc1683f2efd8837dde0c.png?t=1657435331.0875492", "slides": "", "author_site": "Haoyuan Cai, Tengyu Ma, Simon Du", "author": "Haoyuan Cai; Tengyu Ma; Simon Du", "abstract": "We revisit the incremental autonomous exploration problem proposed by Lim and Auer (2012). In this setting, the agent aims to learn a set of near-optimal goal-conditioned policies to reach the $L$-controllable states: states that are incrementally reachable from an initial state $s_0$ within $L$ steps in expectation. We introduce a new algorithm with stronger sample complexity bounds than existing ones. Furthermore, we also prove the first lower bound for the autonomous exploration problem. In particular, the lower bound implies that our proposed algorithm, Value-Aware Autonomous Exploration, is nearly minimax-optimal when the number of $L$-controllable states grows polynomially with respect to $L$. Key in our algorithm design is a connection between autonomous exploration and multi-goal stochastic shortest path, a new problem that naturally generalizes the classical stochastic shortest path problem. 
This new problem and its connection to autonomous exploration can be of independent interest.", "bibtex": "@InProceedings{pmlr-v162-cai22a,\n title = \t {Near-Optimal Algorithms for Autonomous Exploration and Multi-Goal Stochastic Shortest Path},\n author = {Cai, Haoyuan and Ma, Tengyu and Du, Simon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2434--2456},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cai22a/cai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cai22a.html},\n abstract = \t {We revisit the incremental autonomous exploration problem proposed by Lim and Auer (2012). In this setting, the agent aims to learn a set of near-optimal goal-conditioned policies to reach the $L$-controllable states: states that are incrementally reachable from an initial state $s_0$ within $L$ steps in expectation. We introduce a new algorithm with stronger sample complexity bounds than existing ones. Furthermore, we also prove the first lower bound for the autonomous exploration problem. In particular, the lower bound implies that our proposed algorithm, Value-Aware Autonomous Exploration, is nearly minimax-optimal when the number of $L$-controllable states grows polynomially with respect to $L$. Key in our algorithm design is a connection between autonomous exploration and multi-goal stochastic shortest path, a new problem that naturally generalizes the classical stochastic shortest path problem. This new problem and its connection to autonomous exploration can be of independent interest.}\n}", "pdf": "https://proceedings.mlr.press/v162/cai22a/cai22a.pdf", "supp": "", "pdf_size": 522295, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=890609703872839864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Tsinghua University; Stanford University; University of Washington", "aff_domain": "gmail.com;stanford.edu;cs.washington.edu", "email": "gmail.com;stanford.edu;cs.washington.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cai22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Tsinghua University;Stanford University;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.stanford.edu;https://www.washington.edu", "aff_unique_abbr": "THU;Stanford;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Near-Optimal Learning of Extensive-Form Games with Imperfect Information", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17471", "id": "17471", "proceeding": "https://proceedings.mlr.press/v162/bai22b.html", "poster": "/media/PosterPDFs/ICML%202022/d8bf84be3800d12f74d8b05e9b89836f.png?t=1656558662.944097", "slides": "", "author_site": "Yu Bai, Chi Jin, Song Mei, Tiancheng Yu", "author": "Yu Bai; Chi Jin; Song Mei; Tiancheng Yu", "abstract": "This paper resolves the open question of designing near-optimal algorithms for learning imperfect-information extensive-form games from bandit feedback. 
We present the first line of algorithms that require only $\\widetilde{\\mathcal{O}}((XA+YB)/\\varepsilon^2)$ episodes of play to find an $\\varepsilon$-approximate Nash equilibrium in two-player zero-sum games, where $X,Y$ are the number of information sets and $A,B$ are the number of actions for the two players. This improves upon the best known sample complexity of $\\widetilde{\\mathcal{O}}((X^2A+Y^2B)/\\varepsilon^2)$ by a factor of $\\widetilde{\\mathcal{O}}(\\max\\{X, Y\\})$, and matches the information-theoretic lower bound up to logarithmic factors. We achieve this sample complexity by two new algorithms: Balanced Online Mirror Descent, and Balanced Counterfactual Regret Minimization. Both algorithms rely on novel approaches of integrating", "bibtex": "@InProceedings{pmlr-v162-bai22b,\n title = \t {Near-Optimal Learning of Extensive-Form Games with Imperfect Information},\n author = {Bai, Yu and Jin, Chi and Mei, Song and Yu, Tiancheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1337--1382},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bai22b/bai22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/bai22b.html},\n abstract = \t {This paper resolves the open question of designing near-optimal algorithms for learning imperfect-information extensive-form games from bandit feedback. We present the first line of algorithms that require only $\\widetilde{\\mathcal{O}}((XA+YB)/\\varepsilon^2)$ episodes of play to find an $\\varepsilon$-approximate Nash equilibrium in two-player zero-sum games, where $X,Y$ are the number of information sets and $A,B$ are the number of actions for the two players. This improves upon the best known sample complexity of $\\widetilde{\\mathcal{O}}((X^2A+Y^2B)/\\varepsilon^2)$ by a factor of $\\widetilde{\\mathcal{O}}(\\max\\{X, Y\\})$, and matches the information-theoretic lower bound up to logarithmic factors. We achieve this sample complexity by two new algorithms: Balanced Online Mirror Descent, and Balanced Counterfactual Regret Minimization. 
Both algorithms rely on novel approaches of integrating", "pdf": "https://proceedings.mlr.press/v162/bai22b/bai22b.pdf", "supp": "", "pdf_size": 609032, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7770540927858917439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Salesforce Research; Princeton University; UC Berkeley; MIT", "aff_domain": "salesforce.com; ; ; ", "email": "salesforce.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bai22b.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Salesforce;Princeton University;University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": "Salesforce Research;;;", "aff_unique_url": "https://research.salesforce.com;https://www.princeton.edu;https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "Salesforce;Princeton;UC Berkeley;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Near-optimal rate of consistency for linear models with missing values", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17749", "id": "17749", "proceeding": "https://proceedings.mlr.press/v162/ayme22a.html", "poster": "/media/PosterPDFs/ICML%202022/d1c373ab1570cfb9a7dbb53c186b37a2.png?t=1657692851.958765", "slides": "", "author_site": "Alexis Ayme, Claire Boyer, Aymeric Dieuleveut, Erwan Scornet", "author": "Alexis Ayme; Claire Boyer; Aymeric Dieuleveut; Erwan Scornet", "abstract": "Missing values arise in most real-world data sets due to the aggregation of multiple sources and intrinsically missing information (sensor failure, unanswered questions in surveys...). In fact, the very nature of missing values usually prevents us from running standard learning algorithms. In this paper, we focus on the extensively-studied linear models, but in presence of missing values, which turns out to be quite a challenging task. Indeed, the Bayes predictor can be decomposed as a sum of predictors corresponding to each missing pattern. This eventually requires to solve a number of learning tasks, exponential in the number of input features, which makes predictions impossible for current real-world datasets. First, we propose a rigorous setting to analyze a least-square type estimator and establish a bound on the excess risk which increases exponentially in the dimension. Consequently, we leverage the missing data distribution to propose a new algorithm, and derive associated adaptive risk bounds that turn out to be minimax optimal. 
Numerical experiments highlight the benefits of our method compared to state-of-the-art algorithms used for predictions with missing values.", "bibtex": "@InProceedings{pmlr-v162-ayme22a,\n title = \t {Near-optimal rate of consistency for linear models with missing values},\n author = {Ayme, Alexis and Boyer, Claire and Dieuleveut, Aymeric and Scornet, Erwan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1211--1243},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ayme22a/ayme22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ayme22a.html},\n abstract = \t {Missing values arise in most real-world data sets due to the aggregation of multiple sources and intrinsically missing information (sensor failure, unanswered questions in surveys...). In fact, the very nature of missing values usually prevents us from running standard learning algorithms. In this paper, we focus on the extensively-studied linear models, but in presence of missing values, which turns out to be quite a challenging task. Indeed, the Bayes predictor can be decomposed as a sum of predictors corresponding to each missing pattern. This eventually requires to solve a number of learning tasks, exponential in the number of input features, which makes predictions impossible for current real-world datasets. First, we propose a rigorous setting to analyze a least-square type estimator and establish a bound on the excess risk which increases exponentially in the dimension. Consequently, we leverage the missing data distribution to propose a new algorithm, and derive associated adaptive risk bounds that turn out to be minimax optimal. 
Numerical experiments highlight the benefits of our method compared to state-of-the-art algorithms used for predictions with missing values.}\n}", "pdf": "https://proceedings.mlr.press/v162/ayme22a/ayme22a.pdf", "supp": "", "pdf_size": 588813, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2259300164221373158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Sorbonne Universit\u00e9, CNRS, Laboratoire de Probabilit\u00e9s, Statistique et Mod\u00e9lisation (LPSM), F-75005 Paris, France; Sorbonne Universit\u00e9, CNRS, Laboratoire de Probabilit\u00e9s, Statistique et Mod\u00e9lisation (LPSM), F-75005 Paris, France + MOKAPLAN, INRIA Paris + CMAP, UMR7641, Ecole Polytechnique, IP Paris, 91128 Palaiseau, France; CMAP, UMR7641, Ecole Polytechnique, IP Paris, 91128 Palaiseau, France; CMAP, UMR7641, Ecole Polytechnique, IP Paris, 91128 Palaiseau, France", "aff_domain": "sorbonne-universite.fr; ; ; ", "email": "sorbonne-universite.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ayme22a.html", "aff_unique_index": "0;0+1+2;2;2", "aff_unique_norm": "Sorbonne Universit\u00e9;INRIA;Ecole Polytechnique", "aff_unique_dep": "Laboratoire de Probabilit\u00e9s, Statistique et Mod\u00e9lisation (LPSM);;CMAP, UMR7641", "aff_unique_url": "https://www.sorbonne-universite.fr;https://www.inria.fr;https://www.ec-polytechnique.fr", "aff_unique_abbr": "Sorbonne U;INRIA;Polytechnique", "aff_campus_unique_index": "0;0+0+1;1;1", "aff_campus_unique": "Paris;Palaiseau", "aff_country_unique_index": "0;0+0+0;0;0", "aff_country_unique": "France" }, { "title": "Nearly Minimax Optimal Reinforcement Learning with Linear Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17295", "id": "17295", "proceeding": "https://proceedings.mlr.press/v162/hu22a.html", "poster": "/media/PosterPDFs/ICML%202022/975e6107778ce7a40b9878bfb96a16a7_IFuSx4l.png?t=1657900196.5751183", "slides": "", "author_site": "Pihe Hu, Yu Chen, Longbo Huang", "author": "Pihe Hu; Yu Chen; Longbo Huang", "abstract": "We study reinforcement learning with linear function approximation where the transition probability and reward functions are linear with respect to a feature mapping $\\boldsymbol{\\phi}(s,a)$. Specifically, we consider the episodic inhomogeneous linear Markov Decision Process (MDP), and propose a novel computation-efficient algorithm, LSVI-UCB$^+$, which achieves an $\\widetilde{O}(Hd\\sqrt{T})$ regret bound where $H$ is the episode length, $d$ is the feature dimension, and $T$ is the number of steps. LSVI-UCB$^+$ builds on weighted ridge regression and upper confidence value iteration with a Bernstein-type exploration bonus. Our statistical results are obtained with novel analytical tools, including a new Bernstein self-normalized bound with conservatism on elliptical potentials, and refined analysis of the correction term. 
To the best of our knowledge, this is the first minimax optimal algorithm for linear MDPs up to logarithmic factors, which closes the $\\sqrt{Hd}$ gap between the best known upper bound of $\\widetilde{O}(\\sqrt{H^3d^3T})$ in \\cite{jin2020provably} and lower bound of $\\Omega(Hd\\sqrt{T})$ for linear MDPs.", "bibtex": "@InProceedings{pmlr-v162-hu22a,\n title = \t {Nearly Minimax Optimal Reinforcement Learning with Linear Function Approximation},\n author = {Hu, Pihe and Chen, Yu and Huang, Longbo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8971--9019},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hu22a/hu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hu22a.html},\n abstract = \t {We study reinforcement learning with linear function approximation where the transition probability and reward functions are linear with respect to a feature mapping $\\boldsymbol{\\phi}(s,a)$. Specifically, we consider the episodic inhomogeneous linear Markov Decision Process (MDP), and propose a novel computation-efficient algorithm, LSVI-UCB$^+$, which achieves an $\\widetilde{O}(Hd\\sqrt{T})$ regret bound where $H$ is the episode length, $d$ is the feature dimension, and $T$ is the number of steps. LSVI-UCB$^+$ builds on weighted ridge regression and upper confidence value iteration with a Bernstein-type exploration bonus. Our statistical results are obtained with novel analytical tools, including a new Bernstein self-normalized bound with conservatism on elliptical potentials, and refined analysis of the correction term. 
To the best of our knowledge, this is the first minimax optimal algorithm for linear MDPs up to logarithmic factors, which closes the $\\sqrt{Hd}$ gap between the best known upper bound of $\\widetilde{O}(\\sqrt{H^3d^3T})$ in \\cite{jin2020provably} and lower bound of $\\Omega(Hd\\sqrt{T})$ for linear MDPs.}\n}", "pdf": "https://proceedings.mlr.press/v162/hu22a/hu22a.pdf", "supp": "", "pdf_size": 686201, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17976390376770721090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Department of Mathematical Sciences, Tsinghua University, Beijing, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hu22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "Tsinghua", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Nearly Optimal Catoni\u2019s M-estimator for Infinite Variance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16205", "id": "16205", "proceeding": "https://proceedings.mlr.press/v162/bhatt22b.html", "poster": "/media/PosterPDFs/ICML%202022/77c67132097f9b1ff028aed0eca8d21b.png?t=1657852184.1372464", "slides": "/media/icml-2022/Slides/16205.pdf", "author_site": "Sujay Bhatt, Guanhua Fang, Ping Li, Gennady Samorodnitsky", "author": "Sujay Bhatt; Guanhua Fang; Ping Li; Gennady Samorodnitsky", "abstract": "In this paper, we extend the remarkable M-estimator of Catoni\u00a0\\citep{Cat12} to situations where the variance is infinite. In particular, given a sequence of i.i.d random variables\u00a0$\\{X_i\\}_{i=1}^n$ from distribution\u00a0$\\mathcal{D}$ over\u00a0$\\mathbb{R}$ with mean\u00a0$\\mu$, we only assume the existence of a known upper bound\u00a0$\\upsilon_{\\varepsilon} > 0$ on the\u00a0$(1+\\varepsilon)^{th}$ central moment of the random variables, namely, for\u00a0$\\varepsilon \\in (0,1]$ \\[ \\mathbb{E}_{X_1 \\sim \\mathcal{D}} \\Big| X_1 - \\mu \\Big|^{1+\\varepsilon} \\leq \\upsilon_{\\varepsilon}. \\]{The} extension is non-trivial owing to the difficulty in characterizing the roots of certain polynomials of degree smaller than\u00a0$2$. The proposed estimator has the same order of magnitude and the same asymptotic constant as in\u00a0\\citet{Cat12}, but for the case of bounded moments. We further propose a version of the estimator that does not require even the knowledge of\u00a0$\\upsilon_{\\varepsilon}$, but adapts the moment bound in a data-driven manner. 
Finally, to illustrate the usefulness of the derived non-asymptotic confidence bounds, we consider an application in multi-armed bandits and propose best arm identification algorithms, in the fixed confidence setting, that outperform the state of the art.", "bibtex": "@InProceedings{pmlr-v162-bhatt22b,\n title = \t {Nearly Optimal Catoni\u2019s M-estimator for Infinite Variance},\n author = {Bhatt, Sujay and Fang, Guanhua and Li, Ping and Samorodnitsky, Gennady},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1925--1944},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bhatt22b/bhatt22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/bhatt22b.html},\n abstract = \t {In this paper, we extend the remarkable M-estimator of Catoni\u00a0\\citep{Cat12} to situations where the variance is infinite. In particular, given a sequence of i.i.d random variables\u00a0$\\{X_i\\}_{i=1}^n$ from distribution\u00a0$\\mathcal{D}$ over\u00a0$\\mathbb{R}$ with mean\u00a0$\\mu$, we only assume the existence of a known upper bound\u00a0$\\upsilon_{\\varepsilon} > 0$ on the\u00a0$(1+\\varepsilon)^{th}$ central moment of the random variables, namely, for\u00a0$\\varepsilon \\in (0,1]$ \\[ \\mathbb{E}_{X_1 \\sim \\mathcal{D}} \\Big| X_1 - \\mu \\Big|^{1+\\varepsilon} \\leq \\upsilon_{\\varepsilon}. \\]{The} extension is non-trivial owing to the difficulty in characterizing the roots of certain polynomials of degree smaller than\u00a0$2$. The proposed estimator has the same order of magnitude and the same asymptotic constant as in\u00a0\\citet{Cat12}, but for the case of bounded moments. We further propose a version of the estimator that does not require even the knowledge of\u00a0$\\upsilon_{\\varepsilon}$, but adapts the moment bound in a data-driven manner. 
Finally, to illustrate the usefulness of the derived non-asymptotic confidence bounds, we consider an application in multi-armed bandits and propose best arm identification algorithms, in the fixed confidence setting, that outperform the state of the art.}\n}", "pdf": "https://proceedings.mlr.press/v162/bhatt22b/bhatt22b.pdf", "supp": "", "pdf_size": 536712, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13337403480544830707&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Cognitive Computing Lab; Baidu Research; School of ORIE; Cornell University", "aff_domain": "gmail.com;gmail.com;gmail.com;cornell.edu", "email": "gmail.com;gmail.com;gmail.com;cornell.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bhatt22b.html", "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Cognitive Computing Lab;Baidu;Cornell University", "aff_unique_dep": "Cognitive Computing;Baidu Research;School of Operations Research and Information Engineering", "aff_unique_url": ";https://research.baidu.com;https://orie.cornell.edu", "aff_unique_abbr": ";Baidu;ORIE", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;2", "aff_country_unique": ";China;United States" }, { "title": "Nearly Optimal Policy Optimization with Stable at Any Time Guarantee", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16387", "id": "16387", "proceeding": "https://proceedings.mlr.press/v162/wu22n.html", "poster": "/media/PosterPDFs/ICML%202022/b1a59b315fc9a3002ce38bbe070ec3f5.png?t=1657510808.1565394", "slides": "", "author_site": "Tianhao Wu, Yunchang Yang, Han Zhong, Liwei Wang, Simon Du, Jiantao Jiao", "author": "Tianhao Wu; Yunchang Yang; Han Zhong; Liwei Wang; Simon Du; Jiantao Jiao", "abstract": "Policy optimization methods are one of the most widely used classes of Reinforcement Learning (RL) algorithms. However, theoretical understanding of these methods remains insufficient. Even in the episodic (time-inhomogeneous) tabular setting, the state-of-the-art theoretical result of policy-based method in Shani et al. (2020) is only $\\tilde{O}(\\sqrt{S^2AH^4K})$ where $S$ is the number of states, $A$ is the number of actions, $H$ is the horizon, and $K$ is the number of episodes, and there is a $\\sqrt{SH}$ gap compared with the information theoretic lower bound $\\tilde{\\Omega}(\\sqrt{SAH^3K})$ (Jin et al., 2018). To bridge such a gap, we propose a novel algorithm Reference-based Policy Optimization with Stable at Any Time guarantee (RPO-SAT), which features the property \u201cStable at Any Time\u201d. We prove that our algorithm achieves $\\tilde{O}(\\sqrt{SAH^3K} + \\sqrt{AH^4K})$ regret. When $S > H$, our algorithm is minimax optimal when ignoring logarithmic factors. 
To our best knowledge, RPO-SAT is the first computationally efficient, nearly minimax optimal policy-based algorithm for tabular RL.", "bibtex": "@InProceedings{pmlr-v162-wu22n,\n title = \t {Nearly Optimal Policy Optimization with Stable at Any Time Guarantee},\n author = {Wu, Tianhao and Yang, Yunchang and Zhong, Han and Wang, Liwei and Du, Simon and Jiao, Jiantao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24243--24265},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22n/wu22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22n.html},\n abstract = \t {Policy optimization methods are one of the most widely used classes of Reinforcement Learning (RL) algorithms. However, theoretical understanding of these methods remains insufficient. Even in the episodic (time-inhomogeneous) tabular setting, the state-of-the-art theoretical result of policy-based method in Shani et al. (2020) is only $\\tilde{O}(\\sqrt{S^2AH^4K})$ where $S$ is the number of states, $A$ is the number of actions, $H$ is the horizon, and $K$ is the number of episodes, and there is a $\\sqrt{SH}$ gap compared with the information theoretic lower bound $\\tilde{\\Omega}(\\sqrt{SAH^3K})$ (Jin et al., 2018). To bridge such a gap, we propose a novel algorithm Reference-based Policy Optimization with Stable at Any Time guarantee (RPO-SAT), which features the property \u201cStable at Any Time\u201d. We prove that our algorithm achieves $\\tilde{O}(\\sqrt{SAH^3K} + \\sqrt{AH^4K})$ regret. When $S > H$, our algorithm is minimax optimal when ignoring logarithmic factors. To our best knowledge, RPO-SAT is the first computationally efficient, nearly minimax optimal policy-based algorithm for tabular RL.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22n/wu22n.pdf", "supp": "", "pdf_size": 395793, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2765883909114402850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wu22n.html" }, { "title": "Nested Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17439", "id": "17439", "proceeding": "https://proceedings.mlr.press/v162/martin22a.html", "poster": "/media/PosterPDFs/ICML%202022/05a624166c8eb8273b8464e8d9cb5bd9_FqLDnGE.png?t=1657962669.0993438", "slides": "/media/icml-2022/Slides/17439.pdf", "author_site": "Matthieu Martin, Panayotis Mertikopoulos, Thibaud J Rahier, Houssam Zenati", "author": "Matthieu Martin; Panayotis Mertikopoulos; Thibaud Rahier; Houssam Zenati", "abstract": "In many online decision processes, the optimizing agent is called to choose between large numbers of alternatives with many inherent similarities; in turn, these similarities imply closely correlated losses that may confound standard discrete choice models and bandit algorithms. We study this question in the context of nested bandits, a class of adversarial multi-armed bandit problems where the learner seeks to minimize their regret in the presence of a large number of distinct alternatives with a hierarchy of embedded (non-combinatorial) similarities. 
In this setting, optimal algorithms based on the exponential weights blueprint (like Hedge, EXP3, and their variants) may incur significant regret because they tend to spend excessive amounts of time exploring irrelevant alternatives with similar, suboptimal costs. To account for this, we propose a nested exponential weights (NEW) algorithm that performs a layered exploration of the learner\u2019s set of alternatives based on a nested, step-by-step selection method. In so doing, we obtain a series of tight bounds for the learner\u2019s regret showing that online learning problems with a high degree of similarity between alternatives can be resolved efficiently, without a red bus / blue bus paradox occurring.", "bibtex": "@InProceedings{pmlr-v162-martin22a,\n title = \t {Nested Bandits},\n author = {Martin, Matthieu and Mertikopoulos, Panayotis and Rahier, Thibaud and Zenati, Houssam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15093--15121},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/martin22a/martin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/martin22a.html},\n abstract = \t {In many online decision processes, the optimizing agent is called to choose between large numbers of alternatives with many inherent similarities; in turn, these similarities imply closely correlated losses that may confound standard discrete choice models and bandit algorithms. We study this question in the context of nested bandits, a class of adversarial multi-armed bandit problems where the learner seeks to minimize their regret in the presence of a large number of distinct alternatives with a hierarchy of embedded (non-combinatorial) similarities. In this setting, optimal algorithms based on the exponential weights blueprint (like Hedge, EXP3, and their variants) may incur significant regret because they tend to spend excessive amounts of time exploring irrelevant alternatives with similar, suboptimal costs. To account for this, we propose a nested exponential weights (NEW) algorithm that performs a layered exploration of the learner\u2019s set of alternatives based on a nested, step-by-step selection method. In so doing, we obtain a series of tight bounds for the learner\u2019s regret showing that online learning problems with a high degree of similarity between alternatives can be resolved efficiently, without a red bus / blue bus paradox occurring.}\n}", "pdf": "https://proceedings.mlr.press/v162/martin22a/martin22a.pdf", "supp": "", "pdf_size": 688822, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18123483648309549586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Criteo AI Lab; Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Univ. 
Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "aff_domain": "criteo.com;imag.fr;imag.fr;imag.fr", "email": "criteo.com;imag.fr;imag.fr;imag.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/martin22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Criteo;Universite Grenoble Alpes", "aff_unique_dep": "Criteo AI Lab;", "aff_unique_url": "https://www.criteo.com;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "Criteo;UGA", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Grenoble", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Nesterov Accelerated Shuffling Gradient Method for Convex Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16393", "id": "16393", "proceeding": "https://proceedings.mlr.press/v162/tran22a.html", "poster": "/media/PosterPDFs/ICML%202022/b9228e0962a78b84f3d5d92f4faa000b.png?t=1657483610.9510756", "slides": "", "author_site": "Trang Tran, Katya Scheinberg, Lam Nguyen", "author": "Trang H Tran; Katya Scheinberg; Lam M Nguyen", "abstract": "In this paper, we propose Nesterov Accelerated Shuffling Gradient (NASG), a new algorithm for the convex finite-sum minimization problems. Our method integrates the traditional Nesterov\u2019s acceleration momentum with different shuffling sampling schemes. We show that our algorithm has an improved rate of $\\Ocal(1/T)$ using unified shuffling schemes, where $T$ is the number of epochs. This rate is better than that of any other shuffling gradient methods in convex regime. Our convergence analysis does not require an assumption on bounded domain or a bounded gradient condition. For randomized shuffling schemes, we improve the convergence bound further. When employing some initial condition, we show that our method converges faster near the small neighborhood of the solution. Numerical simulations demonstrate the efficiency of our algorithm.", "bibtex": "@InProceedings{pmlr-v162-tran22a,\n title = \t {{N}esterov Accelerated Shuffling Gradient Method for Convex Optimization},\n author = {Tran, Trang H and Scheinberg, Katya and Nguyen, Lam M},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21703--21732},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tran22a/tran22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tran22a.html},\n abstract = \t {In this paper, we propose Nesterov Accelerated Shuffling Gradient (NASG), a new algorithm for the convex finite-sum minimization problems. Our method integrates the traditional Nesterov\u2019s acceleration momentum with different shuffling sampling schemes. We show that our algorithm has an improved rate of $\\Ocal(1/T)$ using unified shuffling schemes, where $T$ is the number of epochs. This rate is better than that of any other shuffling gradient methods in convex regime. Our convergence analysis does not require an assumption on bounded domain or a bounded gradient condition. For randomized shuffling schemes, we improve the convergence bound further. When employing some initial condition, we show that our method converges faster near the small neighborhood of the solution. 
Numerical simulations demonstrate the efficiency of our algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/tran22a/tran22a.pdf", "supp": "", "pdf_size": 1410337, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14735125807077653853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "School of Operations Research and Information Engineering, Cornell University, Ithaca, NY, USA; School of Operations Research and Information Engineering, Cornell University, Ithaca, NY, USA; IBM Research, Thomas J. Watson Research Center, Yorktown Heights, NY, USA", "aff_domain": "ibm.com; ;ibm.com", "email": "ibm.com; ;ibm.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tran22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Cornell University;IBM", "aff_unique_dep": "School of Operations Research and Information Engineering;Thomas J. Watson Research Center", "aff_unique_url": "https://www.cornell.edu;https://www.ibm.com/research", "aff_unique_abbr": "Cornell;IBM", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Ithaca;Yorktown Heights", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Fisher Discriminant Analysis: Optimal Neural Network Embeddings in Polynomial Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18013", "id": "18013", "proceeding": "https://proceedings.mlr.press/v162/bartan22a.html", "poster": "/media/PosterPDFs/ICML%202022/aecad42329922dfc97eee948606e1f8e_rPBxFf0.png?t=1657861838.8410408", "slides": "", "author_site": "Burak Bartan, Mert Pilanci", "author": "Burak Bartan; Mert Pilanci", "abstract": "Fisher\u2019s Linear Discriminant Analysis (FLDA) is a statistical analysis method that linearly embeds data points to a lower dimensional space to maximize a discrimination criterion such that the variance between classes is maximized while the variance within classes is minimized. We introduce a natural extension of FLDA that employs neural networks, called Neural Fisher Discriminant Analysis (NFDA). This method finds the optimal two-layer neural network that embeds data points to optimize the same discrimination criterion. We use tools from convex optimization to transform the optimal neural network embedding problem into a convex problem. The resulting problem is easy to interpret and solve to global optimality. 
We evaluate the method\u2019s performance on synthetic and real datasets.", "bibtex": "@InProceedings{pmlr-v162-bartan22a,\n title = \t {Neural {F}isher Discriminant Analysis: Optimal Neural Network Embeddings in Polynomial Time},\n author = {Bartan, Burak and Pilanci, Mert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1647--1663},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bartan22a/bartan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bartan22a.html},\n abstract = \t {Fisher\u2019s Linear Discriminant Analysis (FLDA) is a statistical analysis method that linearly embeds data points to a lower dimensional space to maximize a discrimination criterion such that the variance between classes is maximized while the variance within classes is minimized. We introduce a natural extension of FLDA that employs neural networks, called Neural Fisher Discriminant Analysis (NFDA). This method finds the optimal two-layer neural network that embeds data points to optimize the same discrimination criterion. We use tools from convex optimization to transform the optimal neural network embedding problem into a convex problem. The resulting problem is easy to interpret and solve to global optimality. We evaluate the method\u2019s performance on synthetic and real datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/bartan22a/bartan22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bartan22a-supp.zip", "pdf_size": 2043494, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13437409388274395665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical Engineering, Stanford University, CA, USA; Department of Electrical Engineering, Stanford University, CA, USA", "aff_domain": "stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/bartan22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neural Implicit Dictionary Learning via Mixture-of-Expert Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18167", "id": "18167", "proceeding": "https://proceedings.mlr.press/v162/wang22d.html", "poster": "/media/PosterPDFs/ICML%202022/b53477c2821c1bf0da5d40e57b870d35.png?t=1657256409.6100144", "slides": "", "author_site": "Peihao Wang, Zhiwen Fan, Tianlong Chen, Zhangyang \u201cAtlas\u201d Wang", "author": "Peihao Wang; Zhiwen Fan; Tianlong Chen; Zhangyang Wang", "abstract": "Representing visual signals by coordinate-based deep fully-connected networks has been shown advantageous in fitting complex details and solving inverse problems than discrete grid-based representation. However, acquiring such a continuous Implicit Neural Representation (INR) requires tedious per-scene training on tons of signal measurements, which limits its practicality. 
In this paper, we present a generic INR framework that achieves both data and training efficiency by learning a Neural Implicit Dictionary (NID) from a data collection and representing INR as a functional combination of wavelets sampled from the dictionary. Our NID assembles a group of coordinate-based subnetworks which are tuned to span the desired function space. After training, one can instantly and robustly acquire an unseen scene representation by solving the coding coefficients. To parallelly optimize a large group of networks, we borrow the idea from Mixture-of-Expert (MoE) to design and train our network with a sparse gating mechanism. Our experiments show that, NID can improve reconstruction of 2D images or 3D scenes by 2 orders of magnitude faster with up to 98% less input data. We further demonstrate various applications of NID in image inpainting and occlusion removal, which are considered to be challenging with vanilla INR. Our codes are available in https://github.com/VITA-Group/Neural-Implicit-Dict.", "bibtex": "@InProceedings{pmlr-v162-wang22d,\n title = \t {Neural Implicit Dictionary Learning via Mixture-of-Expert Training},\n author = {Wang, Peihao and Fan, Zhiwen and Chen, Tianlong and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22613--22624},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22d/wang22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22d.html},\n abstract = \t {Representing visual signals by coordinate-based deep fully-connected networks has been shown advantageous in fitting complex details and solving inverse problems than discrete grid-based representation. However, acquiring such a continuous Implicit Neural Representation (INR) requires tedious per-scene training on tons of signal measurements, which limits its practicality. In this paper, we present a generic INR framework that achieves both data and training efficiency by learning a Neural Implicit Dictionary (NID) from a data collection and representing INR as a functional combination of wavelets sampled from the dictionary. Our NID assembles a group of coordinate-based subnetworks which are tuned to span the desired function space. After training, one can instantly and robustly acquire an unseen scene representation by solving the coding coefficients. To parallelly optimize a large group of networks, we borrow the idea from Mixture-of-Expert (MoE) to design and train our network with a sparse gating mechanism. Our experiments show that, NID can improve reconstruction of 2D images or 3D scenes by 2 orders of magnitude faster with up to 98% less input data. We further demonstrate various applications of NID in image inpainting and occlusion removal, which are considered to be challenging with vanilla INR. 
Our codes are available in https://github.com/VITA-Group/Neural-Implicit-Dict.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22d/wang22d.pdf", "supp": "", "pdf_size": 4097040, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3216325307619643865&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "email": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "github": "https://github.com/VITA-Group/Neural-Implicit-Dict", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wang22d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Inverse Kinematic", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18333", "id": "18333", "proceeding": "https://proceedings.mlr.press/v162/bensadoun22a.html", "poster": "/media/PosterPDFs/ICML%202022/83a100ec3c2c30751156cea2d60aacbe.png?t=1657628223.2810557", "slides": "", "author_site": "Raphael Bensadoun, Shir Gur, Nitsan Blau, Lior Wolf", "author": "Raphael Bensadoun; Shir Gur; Nitsan Blau; Lior Wolf", "abstract": "Inverse kinematic (IK) methods recover the parameters of the joints, given the desired position of selected elements in the kinematic chain. While the problem is well-defined and low-dimensional, it has to be solved rapidly, accounting for multiple possible solutions. In this work, we propose a neural IK method that employs the hierarchical structure of the problem to sequentially sample valid joint angles conditioned on the desired position and on the preceding joints along the chain. In our solution, a hypernetwork $f$ recovers the parameters of multiple primary networks {$g_1,g_2,\u2026,g_N$, where $N$ is the number of joints}, such that each $g_i$ outputs a distribution of possible joint angles, and is conditioned on the sampled values obtained from the previous primary networks $g_j, j", "bibtex": "@InProceedings{pmlr-v162-bensadoun22a,\n title = \t {Neural Inverse Kinematic},\n author = {Bensadoun, Raphael and Gur, Shir and Blau, Nitsan and Wolf, Lior},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1787--1797},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bensadoun22a/bensadoun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bensadoun22a.html},\n abstract = \t {Inverse kinematic (IK) methods recover the parameters of the joints, given the desired position of selected elements in the kinematic chain. 
While the problem is well-defined and low-dimensional, it has to be solved rapidly, accounting for multiple possible solutions. In this work, we propose a neural IK method that employs the hierarchical structure of the problem to sequentially sample valid joint angles conditioned on the desired position and on the preceding joints along the chain. In our solution, a hypernetwork $f$ recovers the parameters of multiple primary networks {$g_1,g_2,\u2026,g_N$, where $N$ is the number of joints}, such that each $g_i$ outputs a distribution of possible joint angles, and is conditioned on the sampled values obtained from the previous primary networks $g_j, j", "pdf": "https://proceedings.mlr.press/v162/bensadoun22a/bensadoun22a.pdf", "supp": "", "pdf_size": 2027864, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4665029103172955634&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/bensadoun22a.html" }, { "title": "Neural Inverse Transform Sampler", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16601", "id": "16601", "proceeding": "https://proceedings.mlr.press/v162/li22j.html", "poster": "/media/PosterPDFs/ICML%202022/ba51e6158bcaf80fd0d834950251e693.png?t=1657589882.0571983", "slides": "", "author_site": "Henry Li, Yuval Kluger", "author": "Henry Li; Yuval Kluger", "abstract": "Any explicit functional representation $f$ of a density is hampered by two main obstacles when we wish to use it as a generative model: designing $f$ so that sampling is fast, and estimating $Z = \\int f$ so that $Z^{-1}f$ integrates to 1. This becomes increasingly complicated as $f$ itself becomes complicated. In this paper, we show that when modeling one-dimensional conditional densities with a neural network, $Z$ can be exactly and efficiently computed by letting the network represent the cumulative distribution function of a target density, and applying a generalized fundamental theorem of calculus. We also derive a fast algorithm for sampling from the resulting representation by the inverse transform method. By extending these principles to higher dimensions, we introduce the \\textbf{Neural Inverse Transform Sampler (NITS)}, a novel deep learning framework for modeling and sampling from general, multidimensional, compactly-supported probability densities. NITS is a highly expressive density estimator that boasts end-to-end differentiability, fast sampling, and exact and cheap likelihood evaluation. 
We demonstrate the applicability of NITS by applying it to realistic, high-dimensional density estimation tasks: likelihood-based generative modeling on the CIFAR-10 dataset, and density estimation on the UCI suite of benchmark datasets, where NITS produces compelling results rivaling or surpassing the state of the art.", "bibtex": "@InProceedings{pmlr-v162-li22j,\n title = \t {Neural Inverse Transform Sampler},\n author = {Li, Henry and Kluger, Yuval},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12813--12825},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22j/li22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22j.html},\n abstract = \t {Any explicit functional representation $f$ of a density is hampered by two main obstacles when we wish to use it as a generative model: designing $f$ so that sampling is fast, and estimating $Z = \\int f$ so that $Z^{-1}f$ integrates to 1. This becomes increasingly complicated as $f$ itself becomes complicated. In this paper, we show that when modeling one-dimensional conditional densities with a neural network, $Z$ can be exactly and efficiently computed by letting the network represent the cumulative distribution function of a target density, and applying a generalized fundamental theorem of calculus. We also derive a fast algorithm for sampling from the resulting representation by the inverse transform method. By extending these principles to higher dimensions, we introduce the \\textbf{Neural Inverse Transform Sampler (NITS)}, a novel deep learning framework for modeling and sampling from general, multidimensional, compactly-supported probability densities. NITS is a highly expressive density estimator that boasts end-to-end differentiability, fast sampling, and exact and cheap likelihood evaluation. 
We demonstrate the applicability of NITS by applying it to realistic, high-dimensional density estimation tasks: likelihood-based generative modeling on the CIFAR-10 dataset, and density estimation on the UCI suite of benchmark datasets, where NITS produces compelling results rivaling or surpassing the state of the art.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22j/li22j.pdf", "supp": "", "pdf_size": 4637357, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3014954787029992873&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22j.html" }, { "title": "Neural Language Models are not Born Equal to Fit Brain Data, but Training Helps", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18283", "id": "18283", "proceeding": "https://proceedings.mlr.press/v162/pasquiou22a.html", "poster": "/media/PosterPDFs/ICML%202022/426ea2897becd89515f19117a51c49f1.png?t=1656318882.5463276", "slides": "", "author_site": "Alexandre Pasquiou, Yair Lakretz, John Hale, Thirion Bertrand, Christophe Pallier", "author": "Alexandre Pasquiou; Yair Lakretz; John T Hale; Bertrand Thirion; Christophe Pallier", "abstract": "Neural Language Models (NLMs) have made tremendous advances during the last years, achieving impressive performance on various linguistic tasks. Capitalizing on this, studies in neuroscience have started to use NLMs to study neural activity in the human brain during language processing. However, many questions remain unanswered regarding which factors determine the ability of a neural language model to capture brain activity (aka its \u2019brain score\u2019). Here, we make first steps in this direction and examine the impact of test loss, training corpus and model architecture (comparing GloVe, LSTM, GPT-2 and BERT), on the prediction of functional Magnetic Resonance Imaging time-courses of participants listening to an audiobook. We find that (1) untrained versions of each model already explain significant amount of signal in the brain by capturing similarity in brain responses across identical words, with the untrained LSTM outperforming the transformer-based models, being less impacted by the effect of context; (2) that training NLP models improves brain scores in the same brain regions irrespective of the model\u2019s architecture; (3) that Perplexity (test loss) is not a good predictor of brain score; (4) that training data have a strong influence on the outcome and, notably, that off-the-shelf models may lack statistical power to detect brain activations. 
Overall, we outline the impact of model-training choices, and suggest good practices for future studies aiming at explaining the human language system using neural language models.", "bibtex": "@InProceedings{pmlr-v162-pasquiou22a,\n title = \t {Neural Language Models are not Born Equal to Fit Brain Data, but Training Helps},\n author = {Pasquiou, Alexandre and Lakretz, Yair and Hale, John T and Thirion, Bertrand and Pallier, Christophe},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17499--17516},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pasquiou22a/pasquiou22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pasquiou22a.html},\n abstract = \t {Neural Language Models (NLMs) have made tremendous advances during the last years, achieving impressive performance on various linguistic tasks. Capitalizing on this, studies in neuroscience have started to use NLMs to study neural activity in the human brain during language processing. However, many questions remain unanswered regarding which factors determine the ability of a neural language model to capture brain activity (aka its \u2019brain score\u2019). Here, we make first steps in this direction and examine the impact of test loss, training corpus and model architecture (comparing GloVe, LSTM, GPT-2 and BERT), on the prediction of functional Magnetic Resonance Imaging time-courses of participants listening to an audiobook. We find that (1) untrained versions of each model already explain significant amount of signal in the brain by capturing similarity in brain responses across identical words, with the untrained LSTM outperforming the transformer-based models, being less impacted by the effect of context; (2) that training NLP models improves brain scores in the same brain regions irrespective of the model\u2019s architecture; (3) that Perplexity (test loss) is not a good predictor of brain score; (4) that training data have a strong influence on the outcome and, notably, that off-the-shelf models may lack statistical power to detect brain activations. Overall, we outline the impact of model-training choices, and suggest good practices for future studies aiming at explaining the human language system using neural language models.}\n}", "pdf": "https://proceedings.mlr.press/v162/pasquiou22a/pasquiou22a.pdf", "supp": "", "pdf_size": 36031175, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13428862531107334035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Cognitive Neuroimaging Unit, INSERM, CEA, Neurospin, Gif-sur-Yvette, France+Parietal, INRIA, CEA, Neurospin, Gif-sur-Yvette, France; Cognitive Neuroimaging Unit, INSERM, CEA, Neurospin, Gif-sur-Yvette, France+Parietal, INRIA, CEA, Neurospin, Gif-sur-Yvette, France; Dept. of Linguistics, U. 
of Georgia, Athens, GA, USA; Parietal, INRIA, CEA, Neurospin, Gif-sur-Yvette, France; Cognitive Neuroimaging Unit, INSERM, CEA, Neurospin, Gif-sur-Yvette, France", "aff_domain": "inria.fr; ; ; ; ", "email": "inria.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/pasquiou22a.html", "aff_unique_index": "0+1;0+1;2;1;0", "aff_unique_norm": "INSERM;INRIA;University of Georgia", "aff_unique_dep": "Cognitive Neuroimaging Unit;Parietal;Department of Linguistics", "aff_unique_url": "https://www.inserm.fr;https://www.inria.fr;https://www.uga.edu", "aff_unique_abbr": "INSERM;INRIA;UGA", "aff_campus_unique_index": ";;1", "aff_campus_unique": ";Athens", "aff_country_unique_index": "0+0;0+0;1;0;0", "aff_country_unique": "France;United States" }, { "title": "Neural Laplace: Learning diverse classes of differential equations in the Laplace domain", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16727", "id": "16727", "proceeding": "https://proceedings.mlr.press/v162/holt22a.html", "poster": "/media/PosterPDFs/ICML%202022/35c5a2cb362c4d214156f930e7d13252.png?t=1657660605.3708146", "slides": "", "author_site": "Samuel Holt, Zhaozhi Qian, Mihaela van der Schaar", "author": "Samuel I Holt; Zhaozhi Qian; Mihaela van der Schaar", "abstract": "Neural Ordinary Differential Equations model dynamical systems with ODEs learned by neural networks. However, ODEs are fundamentally inadequate to model systems with long-range dependencies or discontinuities, which are common in engineering and biological systems. Broader classes of differential equations (DE) have been proposed as remedies, including delay differential equations and integro-differential equations. Furthermore, Neural ODE suffers from numerical instability when modelling stiff ODEs and ODEs with piecewise forcing functions. In this work, we propose Neural Laplace, a unifying framework for learning diverse classes of DEs including all the aforementioned ones. Instead of modelling the dynamics in the time domain, we model it in the Laplace domain, where the history-dependencies and discontinuities in time can be represented as summations of complex exponentials. To make learning more efficient, we use the geometrical stereographic map of a Riemann sphere to induce more smoothness in the Laplace domain. In the experiments, Neural Laplace shows superior performance in modelling and extrapolating the trajectories of diverse classes of DEs, including the ones with complex history dependency and abrupt changes.", "bibtex": "@InProceedings{pmlr-v162-holt22a,\n title = \t {Neural Laplace: Learning diverse classes of differential equations in the {L}aplace domain},\n author = {Holt, Samuel I and Qian, Zhaozhi and van der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8811--8832},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/holt22a/holt22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/holt22a.html},\n abstract = \t {Neural Ordinary Differential Equations model dynamical systems with ODEs learned by neural networks. 
However, ODEs are fundamentally inadequate to model systems with long-range dependencies or discontinuities, which are common in engineering and biological systems. Broader classes of differential equations (DE) have been proposed as remedies, including delay differential equations and integro-differential equations. Furthermore, Neural ODE suffers from numerical instability when modelling stiff ODEs and ODEs with piecewise forcing functions. In this work, we propose Neural Laplace, a unifying framework for learning diverse classes of DEs including all the aforementioned ones. Instead of modelling the dynamics in the time domain, we model it in the Laplace domain, where the history-dependencies and discontinuities in time can be represented as summations of complex exponentials. To make learning more efficient, we use the geometrical stereographic map of a Riemann sphere to induce more smoothness in the Laplace domain. In the experiments, Neural Laplace shows superior performance in modelling and extrapolating the trajectories of diverse classes of DEs, including the ones with complex history dependency and abrupt changes.}\n}", "pdf": "https://proceedings.mlr.press/v162/holt22a/holt22a.pdf", "supp": "", "pdf_size": 3009321, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1106313764091437495&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK; Department of Applied Mathematics and Theoretical Physics, University of Cambridge, UK", "aff_domain": "cam.ac.uk; ; ", "email": "cam.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/holt22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "Department of Applied Mathematics and Theoretical Physics", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Neural Network Poisson Models for Behavioural and Neural Spike Train Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17239", "id": "17239", "proceeding": "https://proceedings.mlr.press/v162/khajehnejad22a.html", "poster": "/media/PosterPDFs/ICML%202022/ef7a3d1d2f039be1cb6a695f856b5ca6_iiijwwo.png?t=1658115979.9692202", "slides": "/media/icml-2022/Slides/17239.pdf", "author_site": "Moein Khajehnejad, Forough Habibollahi, Richard Nock, Ehsan Arabzadeh, Peter Dayan, Amir Dezfouli", "author": "Moein Khajehnejad; Forough Habibollahi; Richard Nock; Ehsan Arabzadeh; Peter Dayan; Amir Dezfouli", "abstract": "One of the most important and challenging application areas for complex machine learning methods is to predict, characterize and model rich, multi-dimensional, neural data. Recent advances in neural recording techniques have made it possible to monitor the activity of a large number of neurons across different brain regions as animals perform behavioural tasks. This poses the critical challenge of establishing links between neural activity at a microscopic scale, which might for instance represent sensory input, and at a macroscopic scale, which then generates behaviour. 
Predominant modeling methods apply rather disjoint techniques to these scales; by contrast, we suggest an end-to-end model which exploits recent developments of flexible, but tractable, neural network point-process models to characterize dependencies between stimuli, actions, and neural data. We apply this model to a public dataset collected using Neuropixel probes in mice performing a visually-guided behavioural task as well as a synthetic dataset produced from a hierarchical network model with reciprocally connected sensory and integration circuits intended to characterize animal behaviour in a fixed-duration motion discrimination task. We show that our model outperforms previous approaches and contributes novel insights into the relationships between neural activity and behaviour.", "bibtex": "@InProceedings{pmlr-v162-khajehnejad22a,\n title = \t {Neural Network Poisson Models for Behavioural and Neural Spike Train Data},\n author = {Khajehnejad, Moein and Habibollahi, Forough and Nock, Richard and Arabzadeh, Ehsan and Dayan, Peter and Dezfouli, Amir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10974--10996},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/khajehnejad22a/khajehnejad22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/khajehnejad22a.html},\n abstract = \t {One of the most important and challenging application areas for complex machine learning methods is to predict, characterize and model rich, multi-dimensional, neural data. Recent advances in neural recording techniques have made it possible to monitor the activity of a large number of neurons across different brain regions as animals perform behavioural tasks. This poses the critical challenge of establishing links between neural activity at a microscopic scale, which might for instance represent sensory input, and at a macroscopic scale, which then generates behaviour. Predominant modeling methods apply rather disjoint techniques to these scales; by contrast, we suggest an end-to-end model which exploits recent developments of flexible, but tractable, neural network point-process models to characterize dependencies between stimuli, actions, and neural data. We apply this model to a public dataset collected using Neuropixel probes in mice performing a visually-guided behavioural task as well as a synthetic dataset produced from a hierarchical network model with reciprocally connected sensory and integration circuits intended to characterize animal behaviour in a fixed-duration motion discrimination task. 
We show that our model outperforms previous approaches and contributes novel insights into the relationships between neural activity and behaviour.}\n}", "pdf": "https://proceedings.mlr.press/v162/khajehnejad22a/khajehnejad22a.pdf", "supp": "", "pdf_size": 11432479, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6312474160577642394&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Data Science and AI, Faculty of Information Technology, Monash University, Melbourne, Australia+Google Research; Department of Biomedical Engineering, Faculty of Engineering and Information Technology, University of Melbourne, Melbourne, Australia; Google Research; Eccles Institute of Neuroscience, John Curtin School of Medical Research, The Australian National University, Canberra, Australia; MPI for Biological Cybernetics, T\u00fcbingen, Germany+The University of T\u00fcbingen, Germany; Data61, CSIRO, Sydney, Australia", "aff_domain": "monash.edu;unimelb.edu.au;google.com;anu.edu.au;tuebingen.mpg.de;data61.csiro.au", "email": "monash.edu;unimelb.edu.au;google.com;anu.edu.au;tuebingen.mpg.de;data61.csiro.au", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/khajehnejad22a.html", "aff_unique_index": "0+1;2;1;3;4+5;6", "aff_unique_norm": "Monash University;Google;University of Melbourne;Australian National University;Max Planck Institute for Biological Cybernetics;University of T\u00fcbingen;CSIRO", "aff_unique_dep": "Department of Data Science and AI;Google Research;Department of Biomedical Engineering;Eccles Institute of Neuroscience;Biological Cybernetics;;Data61", "aff_unique_url": "https://www.monash.edu;https://research.google;https://www.unimelb.edu.au;https://www.anu.edu.au;https://www.biological-cybernetics.de;https://www.uni-tuebingen.de/;https://www.csiro.au", "aff_unique_abbr": "Monash;Google Research;UniMelb;ANU;MPIBC;Uni T\u00fcbingen;CSIRO", "aff_campus_unique_index": "0+1;0;1;2;3;5", "aff_campus_unique": "Melbourne;Mountain View;Canberra;T\u00fcbingen;;Sydney", "aff_country_unique_index": "0+1;0;1;0;2+2;0", "aff_country_unique": "Australia;United States;Germany" }, { "title": "Neural Network Pruning Denoises the Features and Makes Local Connectivity Emerge in Visual Tasks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16071", "id": "16071", "proceeding": "https://proceedings.mlr.press/v162/pellegrini22a.html", "poster": "/media/PosterPDFs/ICML%202022/07cb5f86508f146774a2fac4373a8e50.png?t=1657549668.1291115", "slides": "", "author_site": "Franco Pellegrini, Giulio Biroli", "author": "Franco Pellegrini; Giulio Biroli", "abstract": "Pruning methods can considerably reduce the size of artificial neural networks without harming their performance and in some cases they can even uncover sub-networks that, when trained in isolation, match or surpass the test accuracy of their dense counterparts. Here, we characterize the inductive bias that pruning imprints in such \"winning lottery tickets\": focusing on visual tasks, we analyze the architecture resulting from iterative magnitude pruning of a simple fully connected network. We show that the surviving node connectivity is local in input space, and organized in patterns reminiscent of the ones found in convolutional networks. We investigate the role played by data and tasks in shaping the architecture of the pruned sub-network. 
We find that pruning performances, and the ability to sift out the noise and make local features emerge, improve by increasing the size of the training set, and the semantic value of the data. We also study different pruning procedures, and find that iterative magnitude pruning is particularly effective in distilling meaningful connectivity out of features present in the original task. Our results suggest the possibility to automatically discover new and efficient architectural inductive biases in other datasets and tasks.", "bibtex": "@InProceedings{pmlr-v162-pellegrini22a,\n title = \t {Neural Network Pruning Denoises the Features and Makes Local Connectivity Emerge in Visual Tasks},\n author = {Pellegrini, Franco and Biroli, Giulio},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17601--17626},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pellegrini22a/pellegrini22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pellegrini22a.html},\n abstract = \t {Pruning methods can considerably reduce the size of artificial neural networks without harming their performance and in some cases they can even uncover sub-networks that, when trained in isolation, match or surpass the test accuracy of their dense counterparts. Here, we characterize the inductive bias that pruning imprints in such \"winning lottery tickets\": focusing on visual tasks, we analyze the architecture resulting from iterative magnitude pruning of a simple fully connected network. We show that the surviving node connectivity is local in input space, and organized in patterns reminiscent of the ones found in convolutional networks. We investigate the role played by data and tasks in shaping the architecture of the pruned sub-network. We find that pruning performances, and the ability to sift out the noise and make local features emerge, improve by increasing the size of the training set, and the semantic value of the data. We also study different pruning procedures, and find that iterative magnitude pruning is particularly effective in distilling meaningful connectivity out of features present in the original task. 
Our results suggest the possibility to automatically discover new and efficient architectural inductive biases in other datasets and tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/pellegrini22a/pellegrini22a.pdf", "supp": "", "pdf_size": 4378812, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3749849351915122545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Laboratoire de Physique de l\u2019\u00c9cole normale sup\u00e9rieure, ENS, Universit\u00e9 PSL, CNRS, Sorbonne Universit\u00e9, Universit\u00e9 de Paris \u2014 F-75005 Paris, France; Laboratoire de Physique de l\u2019\u00c9cole normale sup\u00e9rieure, ENS, Universit\u00e9 PSL, CNRS, Sorbonne Universit\u00e9, Universit\u00e9 de Paris \u2014 F-75005 Paris, France", "aff_domain": "phys.ens.fr; ", "email": "phys.ens.fr; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/pellegrini22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "\u00c9cole Normale Sup\u00e9rieure", "aff_unique_dep": "Laboratoire de Physique", "aff_unique_url": "https://www.ens.fr", "aff_unique_abbr": "ENS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Neural Network Weights Do Not Converge to Stationary Points: An Invariant Measure Perspective", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16131", "id": "16131", "proceeding": "https://proceedings.mlr.press/v162/zhang22q.html", "poster": "/media/PosterPDFs/ICML%202022/41ab1b1d6bf108f388dfb5cd282fb76c.png?t=1658193673.970524", "slides": "", "author_site": "Jingzhao Zhang, Haochuan Li, Suvrit Sra, Ali Jadbabaie", "author": "Jingzhao Zhang; Haochuan Li; Suvrit Sra; Ali Jadbabaie", "abstract": "This work examines the deep disconnect between existing theoretical analyses of gradient-based algorithms and the practice of training deep neural networks. Specifically, we provide numerical evidence that in large-scale neural network training (e.g., ImageNet + ResNet101, and WT103 + TransformerXL models), the neural network\u2019s weights do not converge to stationary points where the gradient of the loss is zero. Remarkably, however, we observe that even though the weights do not converge to stationary points, the progress in minimizing the loss function halts and training loss stabilizes. Inspired by this observation, we propose a new perspective based on ergodic theory of dynamical systems to explain it. Rather than studying the evolution of weights, we study the evolution of the distribution of weights. We prove convergence of the distribution of weights to an approximate invariant measure, thereby explaining how the training loss can stabilize without weights necessarily converging to stationary points. 
We further discuss how this perspective can better align optimization theory with empirical observations in machine learning practice.", "bibtex": "@InProceedings{pmlr-v162-zhang22q,\n title = \t {Neural Network Weights Do Not Converge to Stationary Points: An Invariant Measure Perspective},\n author = {Zhang, Jingzhao and Li, Haochuan and Sra, Suvrit and Jadbabaie, Ali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26330--26346},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22q/zhang22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22q.html},\n abstract = \t {This work examines the deep disconnect between existing theoretical analyses of gradient-based algorithms and the practice of training deep neural networks. Specifically, we provide numerical evidence that in large-scale neural network training (e.g., ImageNet + ResNet101, and WT103 + TransformerXL models), the neural network\u2019s weights do not converge to stationary points where the gradient of the loss is zero. Remarkably, however, we observe that even though the weights do not converge to stationary points, the progress in minimizing the loss function halts and training loss stabilizes. Inspired by this observation, we propose a new perspective based on ergodic theory of dynamical systems to explain it. Rather than studying the evolution of weights, we study the evolution of the distribution of weights. We prove convergence of the distribution of weights to an approximate invariant measure, thereby explaining how the training loss can stabilize without weights necessarily converging to stationary points. 
We further discuss how this perspective can better align optimization theory with empirical observations in machine learning practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22q/zhang22q.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhang22q-supp.zip", "pdf_size": 1107228, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8069424644207448552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "IIIS, Tsinghua University; Massachusetts Institute of Technology; Massachusetts Institute of Technology; Massachusetts Institute of Technology", "aff_domain": "mail.tsinghua.edu.cn;mit.edu; ; ", "email": "mail.tsinghua.edu.cn;mit.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22q.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Tsinghua University;Massachusetts Institute of Technology", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://web.mit.edu", "aff_unique_abbr": "THU;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Neural Tangent Kernel Analysis of Deep Narrow Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17535", "id": "17535", "proceeding": "https://proceedings.mlr.press/v162/lee22a.html", "poster": "/media/PosterPDFs/ICML%202022/a8baa56554f96369ab93e4f3bb068c22.png?t=1657872382.302389", "slides": "", "author_site": "Jongmin Lee, Joo Young Choi, Ernest Ryu, Albert No", "author": "Jongmin Lee; Joo Young Choi; Ernest K Ryu; Albert No", "abstract": "The tremendous recent progress in analyzing the training dynamics of overparameterized neural networks has primarily focused on wide networks and therefore does not sufficiently address the role of depth in deep learning. In this work, we present the first trainability guarantee of infinitely deep but narrow neural networks. We study the infinite-depth limit of a multilayer perceptron (MLP) with a specific initialization and establish a trainability guarantee using the NTK theory. We then extend the analysis to an infinitely deep convolutional neural network (CNN) and perform brief experiments.", "bibtex": "@InProceedings{pmlr-v162-lee22a,\n title = \t {Neural Tangent Kernel Analysis of Deep Narrow Neural Networks},\n author = {Lee, Jongmin and Choi, Joo Young and Ryu, Ernest K and No, Albert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12282--12351},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22a/lee22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22a.html},\n abstract = \t {The tremendous recent progress in analyzing the training dynamics of overparameterized neural networks has primarily focused on wide networks and therefore does not sufficiently address the role of depth in deep learning. In this work, we present the first trainability guarantee of infinitely deep but narrow neural networks. 
We study the infinite-depth limit of a multilayer perceptron (MLP) with a specific initialization and establish a trainability guarantee using the NTK theory. We then extend the analysis to an infinitely deep convolutional neural network (CNN) and perform brief experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22a/lee22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lee22a-supp.zip", "pdf_size": 735047, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11344426025520591295&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Mathematical Sciences, Seoul National University, Seoul, Korea; Department of Mathematical Sciences, Seoul National University, Seoul, Korea; Department of Mathematical Sciences, Seoul National University, Seoul, Korea; Department of Electronic and Electrical Engineering, Hongik University, Seoul, Korea", "aff_domain": "snu.ac.kr; ; ;hongik.ac.kr", "email": "snu.ac.kr; ; ;hongik.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lee22a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Seoul National University;Hongik University", "aff_unique_dep": "Department of Mathematical Sciences;Department of Electronic and Electrical Engineering", "aff_unique_url": "https://www.snu.ac.kr;http://www.hongik.ac.kr", "aff_unique_abbr": "SNU;Hongik", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Neural Tangent Kernel Beyond the Infinite-Width Limit: Effects of Depth and Initialization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16473", "id": "16473", "proceeding": "https://proceedings.mlr.press/v162/seleznova22a.html", "poster": "/media/PosterPDFs/ICML%202022/45624a44b89793087e9ef4d076018adb.png?t=1658075629.7220912", "slides": "/media/icml-2022/Slides/16473.pdf", "author_site": "Mariia Seleznova, Gitta Kutyniok", "author": "Mariia Seleznova; Gitta Kutyniok", "abstract": "Neural Tangent Kernel (NTK) is widely used to analyze overparametrized neural networks due to the famous result by Jacot et al. (2018): in the infinite-width limit, the NTK is deterministic and constant during training. However, this result cannot explain the behavior of deep networks, since it generally does not hold if depth and width tend to infinity simultaneously. In this paper, we study the NTK of fully-connected ReLU networks with depth comparable to width. We prove that the NTK properties depend significantly on the depth-to-width ratio and the distribution of parameters at initialization. In fact, our results indicate the importance of the three phases in the hyperparameter space identified in Poole et al. (2016): ordered, chaotic and the edge of chaos (EOC). We derive exact expressions for the NTK dispersion in the infinite-depth-and-width limit in all three phases and conclude that the NTK variability grows exponentially with depth at the EOC and in the chaotic phase but not in the ordered phase. 
We also show that the NTK of deep networks may stay constant during training only in the ordered phase and discuss how the structure of the NTK matrix changes during training.", "bibtex": "@InProceedings{pmlr-v162-seleznova22a,\n title = \t {Neural Tangent Kernel Beyond the Infinite-Width Limit: Effects of Depth and Initialization},\n author = {Seleznova, Mariia and Kutyniok, Gitta},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19522--19560},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/seleznova22a/seleznova22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/seleznova22a.html},\n abstract = \t {Neural Tangent Kernel (NTK) is widely used to analyze overparametrized neural networks due to the famous result by Jacot et al. (2018): in the infinite-width limit, the NTK is deterministic and constant during training. However, this result cannot explain the behavior of deep networks, since it generally does not hold if depth and width tend to infinity simultaneously. In this paper, we study the NTK of fully-connected ReLU networks with depth comparable to width. We prove that the NTK properties depend significantly on the depth-to-width ratio and the distribution of parameters at initialization. In fact, our results indicate the importance of the three phases in the hyperparameter space identified in Poole et al. (2016): ordered, chaotic and the edge of chaos (EOC). We derive exact expressions for the NTK dispersion in the infinite-depth-and-width limit in all three phases and conclude that the NTK variability grows exponentially with depth at the EOC and in the chaotic phase but not in the ordered phase. 
We also show that the NTK of deep networks may stay constant during training only in the ordered phase and discuss how the structure of the NTK matrix changes during training.}\n}", "pdf": "https://proceedings.mlr.press/v162/seleznova22a/seleznova22a.pdf", "supp": "", "pdf_size": 3241504, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16495366436833298314&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Mathematics, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen, Munich, Germany; Department of Mathematics, Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen, Munich, Germany", "aff_domain": "math.lmu.de; ", "email": "math.lmu.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/seleznova22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Ludwig-Maximilians-Universit\u00e4t M\u00fcnchen", "aff_unique_dep": "Department of Mathematics", "aff_unique_url": "https://www.lmu.de", "aff_unique_abbr": "LMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Neural Tangent Kernel Empowered Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16731", "id": "16731", "proceeding": "https://proceedings.mlr.press/v162/yue22a.html", "poster": "/media/PosterPDFs/ICML%202022/3dea6b598a16b334a53145e78701fa87.png?t=1657310459.4644346", "slides": "", "author_site": "Kai Yue, Richeng Jin, Ryan Pilgrim, Chau-Wai Wong, Dror Baron, Huaiyu Dai", "author": "Kai Yue; Richeng Jin; Ryan Pilgrim; Chau-Wai Wong; Dror Baron; Huaiyu Dai", "abstract": "Federated learning (FL) is a privacy-preserving paradigm where multiple participants jointly solve a machine learning problem without sharing raw data. Unlike traditional distributed learning, a unique characteristic of FL is statistical heterogeneity, namely, data distributions across participants are different from each other. Meanwhile, recent advances in the interpretation of neural networks have seen a wide use of neural tangent kernels (NTKs) for convergence analyses. In this paper, we propose a novel FL paradigm empowered by the NTK framework. The paradigm addresses the challenge of statistical heterogeneity by transmitting update data that are more expressive than those of the conventional FL paradigms. Specifically, sample-wise Jacobian matrices, rather than model weights/gradients, are uploaded by participants. The server then constructs an empirical kernel matrix to update a global model without explicitly performing gradient descent. We further develop a variant with improved communication efficiency and enhanced privacy. 
Numerical results show that the proposed paradigm can achieve the same accuracy while reducing the number of communication rounds by an order of magnitude compared to federated averaging.", "bibtex": "@InProceedings{pmlr-v162-yue22a,\n title = \t {Neural Tangent Kernel Empowered Federated Learning},\n author = {Yue, Kai and Jin, Richeng and Pilgrim, Ryan and Wong, Chau-Wai and Baron, Dror and Dai, Huaiyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25783--25803},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yue22a/yue22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yue22a.html},\n abstract = \t {Federated learning (FL) is a privacy-preserving paradigm where multiple participants jointly solve a machine learning problem without sharing raw data. Unlike traditional distributed learning, a unique characteristic of FL is statistical heterogeneity, namely, data distributions across participants are different from each other. Meanwhile, recent advances in the interpretation of neural networks have seen a wide use of neural tangent kernels (NTKs) for convergence analyses. In this paper, we propose a novel FL paradigm empowered by the NTK framework. The paradigm addresses the challenge of statistical heterogeneity by transmitting update data that are more expressive than those of the conventional FL paradigms. Specifically, sample-wise Jacobian matrices, rather than model weights/gradients, are uploaded by participants. The server then constructs an empirical kernel matrix to update a global model without explicitly performing gradient descent. We further develop a variant with improved communication efficiency and enhanced privacy. 
Numerical results show that the proposed paradigm can achieve the same accuracy while reducing the number of communication rounds by an order of magnitude compared to federated averaging.}\n}", "pdf": "https://proceedings.mlr.press/v162/yue22a/yue22a.pdf", "supp": "", "pdf_size": 1260243, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5964318838593905351&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "NC State University; NC State University; Independent Scholar; NC State University; NC State University; NC State University", "aff_domain": "ncsu.edu; ; ; ; ; ", "email": "ncsu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yue22a.html", "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "North Carolina State University;Independent Scholar", "aff_unique_dep": ";", "aff_unique_url": "https://www.ncsu.edu;", "aff_unique_abbr": "NC State;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "Neural-Symbolic Models for Logical Queries on Knowledge Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16233", "id": "16233", "proceeding": "https://proceedings.mlr.press/v162/zhu22c.html", "poster": "/media/PosterPDFs/ICML%202022/07a9d3fed4c5ea6b17e80258dee231fa.png?t=1656703451.2128139", "slides": "/media/icml-2022/Slides/16233.pdf", "author_site": "Zhaocheng Zhu, Mikhail Galkin, Zuobai Zhang, Jian Tang", "author": "Zhaocheng Zhu; Mikhail Galkin; Zuobai Zhang; Jian Tang", "abstract": "Answering complex first-order logic (FOL) queries on knowledge graphs is a fundamental task for multi-hop reasoning. Traditional symbolic methods traverse a complete knowledge graph to extract the answers, which provides good interpretation for each step. Recent neural methods learn geometric embeddings for complex queries. These methods can generalize to incomplete knowledge graphs, but their reasoning process is hard to interpret. In this paper, we propose Graph Neural Network Query Executor (GNN-QE), a neural-symbolic model that enjoys the advantages of both worlds. GNN-QE decomposes a complex FOL query into relation projections and logical operations over fuzzy sets, which provides interpretability for intermediate variables. To reason about the missing links, GNN-QE adapts a graph neural network from knowledge graph completion to execute the relation projections, and models the logical operations with product fuzzy logic. Experiments on 3 datasets show that GNN-QE significantly improves over previous state-of-the-art models in answering FOL queries. 
Meanwhile, GNN-QE can predict the number of answers without explicit supervision, and provide visualizations for intermediate variables.", "bibtex": "@InProceedings{pmlr-v162-zhu22c,\n title = \t {Neural-Symbolic Models for Logical Queries on Knowledge Graphs},\n author = {Zhu, Zhaocheng and Galkin, Mikhail and Zhang, Zuobai and Tang, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27454--27478},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22c/zhu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22c.html},\n abstract = \t {Answering complex first-order logic (FOL) queries on knowledge graphs is a fundamental task for multi-hop reasoning. Traditional symbolic methods traverse a complete knowledge graph to extract the answers, which provides good interpretation for each step. Recent neural methods learn geometric embeddings for complex queries. These methods can generalize to incomplete knowledge graphs, but their reasoning process is hard to interpret. In this paper, we propose Graph Neural Network Query Executor (GNN-QE), a neural-symbolic model that enjoys the advantages of both worlds. GNN-QE decomposes a complex FOL query into relation projections and logical operations over fuzzy sets, which provides interpretability for intermediate variables. To reason about the missing links, GNN-QE adapts a graph neural network from knowledge graph completion to execute the relation projections, and models the logical operations with product fuzzy logic. Experiments on 3 datasets show that GNN-QE significantly improves over previous state-of-the-art models in answering FOL queries. 
Meanwhile, GNN-QE can predict the number of answers without explicit supervision, and provide visualizations for intermediate variables.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22c/zhu22c.pdf", "supp": "", "pdf_size": 881713, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2755509975751664011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Mila - Qu\u00e9bec AI Institute+Universit\u00e9 de Montr\u00e9al+McGill University; Mila - Qu\u00e9bec AI Institute+Universit\u00e9 de Montr\u00e9al; Mila - Qu\u00e9bec AI Institute+Universit\u00e9 de Montr\u00e9al+McGill University; HEC Montr\u00e9al+CIFAR AI Chair", "aff_domain": "umontreal.ca; ; ;hec.ca", "email": "umontreal.ca; ; ;hec.ca", "github": "https://github.com/DeepGraphLearning/GNN-QE", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhu22c.html", "aff_unique_index": "0+1+2;0+1;0+1+2;3+4", "aff_unique_norm": "Mila - Quebec AI Institute;Universit\u00e9 de Montr\u00e9al;McGill University;HEC Montr\u00e9al;CIFAR", "aff_unique_dep": "Quebec AI Institute;;;;AI Chair", "aff_unique_url": "https://mila.quebec;https://www.umontreal.ca;https://www.mcgill.ca;https://www.hec.ca;https://www.cifar.ca", "aff_unique_abbr": "Mila;UdeM;McGill;HEC;CIFAR", "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0+0+0;0+0;0+0+0;0+0", "aff_country_unique": "Canada" }, { "title": "NeuralEF: Deconstructing Kernels by Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18261", "id": "18261", "proceeding": "https://proceedings.mlr.press/v162/deng22b.html", "poster": "/media/PosterPDFs/ICML%202022/565030e1fce4e481f9823a7de3b8a047_V8rTgoV.png?t=1657160126.4968808", "slides": "/media/icml-2022/Slides/18261.pdf", "author_site": "Zhijie Deng, Jiaxin Shi, Jun Zhu", "author": "Zhijie Deng; Jiaxin Shi; Jun Zhu", "abstract": "Learning the principal eigenfunctions of an integral operator defined by a kernel and a data distribution is at the core of many machine learning problems. Traditional nonparametric solutions based on the Nystrom formula suffer from scalability issues. Recent work has resorted to a parametric approach, i.e., training neural networks to approximate the eigenfunctions. However, the existing method relies on an expensive orthogonalization step and is difficult to implement. We show that these problems can be fixed by using a new series of objective functions that generalizes the EigenGame to function space. We test our method on a variety of supervised and unsupervised learning problems and show it provides accurate approximations to the eigenfunctions of polynomial, radial basis, neural network Gaussian process, and neural tangent kernels. Finally, we demonstrate our method can scale up linearised Laplace approximation of deep neural networks to modern image classification datasets through approximating the Gauss-Newton matrix. 
Code is available at https://github.com/thudzj/neuraleigenfunction.", "bibtex": "@InProceedings{pmlr-v162-deng22b,\n title = \t {{N}eural{EF}: Deconstructing Kernels by Deep Neural Networks},\n author = {Deng, Zhijie and Shi, Jiaxin and Zhu, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4976--4992},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/deng22b/deng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/deng22b.html},\n abstract = \t {Learning the principal eigenfunctions of an integral operator defined by a kernel and a data distribution is at the core of many machine learning problems. Traditional nonparametric solutions based on the Nystrom formula suffer from scalability issues. Recent work has resorted to a parametric approach, i.e., training neural networks to approximate the eigenfunctions. However, the existing method relies on an expensive orthogonalization step and is difficult to implement. We show that these problems can be fixed by using a new series of objective functions that generalizes the EigenGame to function space. We test our method on a variety of supervised and unsupervised learning problems and show it provides accurate approximations to the eigenfunctions of polynomial, radial basis, neural network Gaussian process, and neural tangent kernels. Finally, we demonstrate our method can scale up linearised Laplace approximation of deep neural networks to modern image classification datasets through approximating the Gauss-Newton matrix. Code is available at https://github.com/thudzj/neuraleigenfunction.}\n}", "pdf": "https://proceedings.mlr.press/v162/deng22b/deng22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/deng22b-supp.zip", "pdf_size": 3852634, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14961387103388663924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Dept. of Comp. Sci. & Tech., BNRist Center, Tsinghua-Bosch Joint Center for ML, Tsinghua University + Peng Cheng Laboratory; Microsoft Research New England; Dept. of Comp. Sci. & Tech., BNRist Center, Tsinghua-Bosch Joint Center for ML, Tsinghua University", "aff_domain": "tsinghua.edu.cn; ;tsinghua.edu.cn", "email": "tsinghua.edu.cn; ;tsinghua.edu.cn", "github": "https://github.com/thudzj/neuraleigenfunction", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/deng22b.html", "aff_unique_index": "0+1;2;0", "aff_unique_norm": "Tsinghua University;Pengcheng Laboratory;Microsoft", "aff_unique_dep": "Dept. of Comp. Sci. 
& Tech.;Peng Cheng Laboratory;Microsoft Research", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.pcl.ac.cn;https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "THU;PCL;MSR NE", "aff_campus_unique_index": ";1", "aff_campus_unique": ";New England", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "China;United States" }, { "title": "Neuro-Symbolic Hierarchical Rule Induction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17187", "id": "17187", "proceeding": "https://proceedings.mlr.press/v162/glanois22a.html", "poster": "/media/PosterPDFs/ICML%202022/16c222aa19898e5058938167c8ab6c57_WuVOimn.png?t=1657870369.35803", "slides": "", "author_site": "Claire Glanois, Zhaohui Jiang, Xuening Feng, Paul Weng, Matthieu Zimmer, Dong Li, Wulong Liu, Jianye Hao", "author": "Claire Glanois; Zhaohui Jiang; Xuening Feng; Paul Weng; Matthieu Zimmer; Dong Li; Wulong Liu; Jianye Hao", "abstract": "We propose Neuro-Symbolic Hierarchical Rule Induction, an efficient interpretable neuro-symbolic model, to solve Inductive Logic Programming (ILP) problems. In this model, which is built from a pre-defined set of meta-rules organized in a hierarchical structure, first-order rules are invented by learning embeddings to match facts and body predicates of a meta-rule. To instantiate, we specifically design an expressive set of generic meta-rules, and demonstrate they generate a consequent fragment of Horn clauses. As a differentiable model, HRI can be trained both via supervised learning and reinforcement learning. To converge to interpretable rules, we inject a controlled noise to avoid local optima and employ an interpretability-regularization term. We empirically validate our model on various tasks (ILP, visual genome, reinforcement learning) against relevant state-of-the-art methods, including traditional ILP methods and neuro-symbolic models.", "bibtex": "@InProceedings{pmlr-v162-glanois22a,\n title = \t {Neuro-Symbolic Hierarchical Rule Induction},\n author = {Glanois, Claire and Jiang, Zhaohui and Feng, Xuening and Weng, Paul and Zimmer, Matthieu and Li, Dong and Liu, Wulong and Hao, Jianye},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7583--7615},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/glanois22a/glanois22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/glanois22a.html},\n abstract = \t {We propose Neuro-Symbolic Hierarchical Rule Induction, an efficient interpretable neuro-symbolic model, to solve Inductive Logic Programming (ILP) problems. In this model, which is built from a pre-defined set of meta-rules organized in a hierarchical structure, first-order rules are invented by learning embeddings to match facts and body predicates of a meta-rule. To instantiate, we specifically design an expressive set of generic meta-rules, and demonstrate they generate a consequent fragment of Horn clauses. As a differentiable model, HRI can be trained both via supervised learning and reinforcement learning. To converge to interpretable rules, we inject a controlled noise to avoid local optima and employ an interpretability-regularization term. 
We empirically validate our model on various tasks (ILP, visual genome, reinforcement learning) against relevant state-of-the-art methods, including traditional ILP methods and neuro-symbolic models.}\n}", "pdf": "https://proceedings.mlr.press/v162/glanois22a/glanois22a.pdf", "supp": "", "pdf_size": 1522108, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18280917446371492557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "IT University of Copenhagen, Denmark; UM-SJTU Joint Institute, Shanghai Jiao Tong University, Shanghai, China; UM-SJTU Joint Institute, Shanghai Jiao Tong University, Shanghai, China; UM-SJTU Joint Institute, Shanghai Jiao Tong University, Shanghai, China; UM-SJTU Joint Institute, Shanghai Jiao Tong University, Shanghai, China; Huawei Noah\u2019s Ark Lab, China; Huawei Noah\u2019s Ark Lab, China; Huawei Noah\u2019s Ark Lab, China+School of Computing and Intelligence, Tianjin University", "aff_domain": "sjtu.edu.cn; ; ; ; ; ; ; ", "email": "sjtu.edu.cn; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/glanois22a.html", "aff_unique_index": "0;1;1;1;1;2;2;2+3", "aff_unique_norm": "IT University of Copenhagen;Shanghai Jiao Tong University;Huawei;Tianjin University", "aff_unique_dep": ";UM-SJTU Joint Institute;Huawei Noah\u2019s Ark Lab;School of Computing and Intelligence", "aff_unique_url": "https://itu.dk;https://en.sjtu.edu.cn;https://www.huawei.com/en/ai/noahs-ark-lab;http://www.tju.edu.cn", "aff_unique_abbr": "ITU;SJTU;HNAL;Tianjin U", "aff_campus_unique_index": "1;1;1;1;", "aff_campus_unique": ";Shanghai", "aff_country_unique_index": "0;1;1;1;1;1;1;1+1", "aff_country_unique": "Denmark;China" }, { "title": "Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16305", "id": "16305", "proceeding": "https://proceedings.mlr.press/v162/alon22a.html", "poster": "/media/PosterPDFs/ICML%202022/20c86a628232a67e7bd46f76fba7ce12_zrDhUbL.png?t=1658280503.8939345", "slides": "/media/icml-2022/Slides/16305.pdf", "author_site": "Uri Alon, Frank Xu, Junxian He, Sudipta Sengupta, Dan Roth, Graham Neubig", "author": "Uri Alon; Frank Xu; Junxian He; Sudipta Sengupta; Dan Roth; Graham Neubig", "abstract": "Retrieval-based language models (R-LM) model the probability of natural language text by combining a standard language model (LM) with examples retrieved from an external datastore at test time. While effective, a major bottleneck of using these models in practice is the computationally costly datastore search, which can be performed as frequently as every time step. In this paper, we present RetoMaton - retrieval automaton - which approximates the datastore search, based on (1) saving pointers between consecutive datastore entries, and (2) clustering of entries into \"states\". This effectively results in a weighted finite automaton built on top of the datastore, instead of representing the datastore as a flat list. The creation of the automaton is unsupervised, and a RetoMaton can be constructed from any text collection: either the original training corpus or from another domain. Traversing this automaton at inference time, in parallel to the LM inference, reduces its perplexity by up to 1.85, or alternatively saves up to 83% of the nearest neighbor searches over $k$NN-LM (Khandelwal et al., 2020) without hurting perplexity. 
Our code and trained models are available at https://github.com/neulab/retomaton .", "bibtex": "@InProceedings{pmlr-v162-alon22a,\n title = \t {Neuro-Symbolic Language Modeling with Automaton-augmented Retrieval},\n author = {Alon, Uri and Xu, Frank and He, Junxian and Sengupta, Sudipta and Roth, Dan and Neubig, Graham},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {468--485},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alon22a/alon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alon22a.html},\n abstract = \t {Retrieval-based language models (R-LM) model the probability of natural language text by combining a standard language model (LM) with examples retrieved from an external datastore at test time. While effective, a major bottleneck of using these models in practice is the computationally costly datastore search, which can be performed as frequently as every time step. In this paper, we present RetoMaton - retrieval automaton - which approximates the datastore search, based on (1) saving pointers between consecutive datastore entries, and (2) clustering of entries into \"states\". This effectively results in a weighted finite automaton built on top of the datastore, instead of representing the datastore as a flat list. The creation of the automaton is unsupervised, and a RetoMaton can be constructed from any text collection: either the original training corpus or from another domain. Traversing this automaton at inference time, in parallel to the LM inference, reduces its perplexity by up to 1.85, or alternatively saves up to 83% of the nearest neighbor searches over $k$NN-LM (Khandelwal et al., 2020) without hurting perplexity. 
Our code and trained models are available at https://github.com/neulab/retomaton .}\n}", "pdf": "https://proceedings.mlr.press/v162/alon22a/alon22a.pdf", "supp": "", "pdf_size": 526193, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6129911060796583281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Language Technologies Institute, Carnegie Mellon University; Language Technologies Institute, Carnegie Mellon University; Language Technologies Institute, Carnegie Mellon University; Amazon AWS; AWS AI Labs; Language Technologies Institute, Carnegie Mellon University", "aff_domain": "cs.cmu.edu;cs.cmu.edu;cs.cmu.edu;amazon.com;amazon.com;cs.cmu.edu", "email": "cs.cmu.edu;cs.cmu.edu;cs.cmu.edu;amazon.com;amazon.com;cs.cmu.edu", "github": "https://github.com/neulab/retomaton", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/alon22a.html", "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Amazon", "aff_unique_dep": "Language Technologies Institute;Amazon Web Services", "aff_unique_url": "https://www.cmu.edu;https://aws.amazon.com", "aff_unique_abbr": "CMU;AWS", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "NeuroFluid: Fluid Dynamics Grounding with Particle-Driven Neural Radiance Fields", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18385", "id": "18385", "proceeding": "https://proceedings.mlr.press/v162/guan22a.html", "poster": "/media/PosterPDFs/ICML%202022/85267d349a5e647ff0a9edcb5ffd1e02_eDmkzYk.png?t=1656610120.2918684", "slides": "/media/icml-2022/Slides/18385.pdf", "author_site": "Shanyan Guan, Huayu Deng, Yunbo Wang, Xiaokang Yang", "author": "Shanyan Guan; Huayu Deng; Yunbo Wang; Xiaokang Yang", "abstract": "Deep learning has shown great potential for modeling the physical dynamics of complex particle systems such as fluids. Existing approaches, however, require the supervision of consecutive particle properties, including positions and velocities. In this paper, we consider a partially observable scenario known as fluid dynamics grounding, that is, inferring the state transitions and interactions within the fluid particle systems from sequential visual observations of the fluid surface. We propose a differentiable two-stage network named NeuroFluid. Our approach consists of (i) a particle-driven neural renderer, which involves fluid physical properties into the volume rendering function, and (ii) a particle transition model optimized to reduce the differences between the rendered and the observed images. NeuroFluid provides the first solution to unsupervised learning of particle-based fluid dynamics by training these two models jointly. 
It is shown to reasonably estimate the underlying physics of fluids with different initial shapes, viscosity, and densities.", "bibtex": "@InProceedings{pmlr-v162-guan22a,\n title = \t {{N}euro{F}luid: Fluid Dynamics Grounding with Particle-Driven Neural Radiance Fields},\n author = {Guan, Shanyan and Deng, Huayu and Wang, Yunbo and Yang, Xiaokang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7919--7929},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guan22a/guan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/guan22a.html},\n abstract = \t {Deep learning has shown great potential for modeling the physical dynamics of complex particle systems such as fluids. Existing approaches, however, require the supervision of consecutive particle properties, including positions and velocities. In this paper, we consider a partially observable scenario known as fluid dynamics grounding, that is, inferring the state transitions and interactions within the fluid particle systems from sequential visual observations of the fluid surface. We propose a differentiable two-stage network named NeuroFluid. Our approach consists of (i) a particle-driven neural renderer, which involves fluid physical properties into the volume rendering function, and (ii) a particle transition model optimized to reduce the differences between the rendered and the observed images. NeuroFluid provides the first solution to unsupervised learning of particle-based fluid dynamics by training these two models jointly. 
It is shown to reasonably estimate the underlying physics of fluids with different initial shapes, viscosity, and densities.}\n}", "pdf": "https://proceedings.mlr.press/v162/guan22a/guan22a.pdf", "supp": "", "pdf_size": 4674577, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2100037275544618256&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai 200240, China; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai 200240, China; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai 200240, China; MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University, Shanghai 200240, China", "aff_domain": "sjtu.edu.cn; ; ; ", "email": "sjtu.edu.cn; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/guan22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "AI Institute", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Neurocoder: General-Purpose Computation Using Stored Neural Programs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17707", "id": "17707", "proceeding": "https://proceedings.mlr.press/v162/le22b.html", "poster": "/media/PosterPDFs/ICML%202022/8597a6cfa74defcbde3047c891d78f90.png?t=1657160381.0355704", "slides": "", "author_site": "Hung Le, Svetha Venkatesh", "author": "Hung Le; Svetha Venkatesh", "abstract": "Artificial Neural Networks are functionally equivalent to special-purpose computers. Their inter-neuronal connection weights represent the learnt Neural Program that instructs the networks on how to compute the data. However, without storing Neural Programs, they are restricted to only one, overwriting learnt programs when trained on new data. Here we design Neurocoder, a new class of general-purpose neural networks in which the neural network \u201ccodes\u201d itself in a data-responsive way by composing relevant programs from a set of shareable, modular programs stored in external memory. This time, a Neural Program is efficiently treated as data in memory. 
Integrating Neurocoder into current neural architectures, we demonstrate new capacity to learn modular programs, reuse simple programs to build complex ones, handle pattern shifts and remember old programs as new ones are learnt, and show substantial performance improvement in solving object recognition, playing video games and continual learning tasks.", "bibtex": "@InProceedings{pmlr-v162-le22b,\n title = \t {Neurocoder: General-Purpose Computation Using Stored Neural Programs},\n author = {Le, Hung and Venkatesh, Svetha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12204--12221},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/le22b/le22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/le22b.html},\n abstract = \t {Artificial Neural Networks are functionally equivalent to special-purpose computers. Their inter-neuronal connection weights represent the learnt Neural Program that instructs the networks on how to compute the data. However, without storing Neural Programs, they are restricted to only one, overwriting learnt programs when trained on new data. Here we design Neurocoder, a new class of general-purpose neural networks in which the neural network \u201ccodes\u201d itself in a data-responsive way by composing relevant programs from a set of shareable, modular programs stored in external memory. This time, a Neural Program is efficiently treated as data in memory. Integrating Neurocoder into current neural architectures, we demonstrate new capacity to learn modular programs, reuse simple programs to build complex ones, handle pattern shifts and remember old programs as new ones are learnt, and show substantial performance improvement in solving object recognition, playing video games and continual learning tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/le22b/le22b.pdf", "supp": "", "pdf_size": 2931270, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15839589835275514875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Applied AI Institute, Deakin University, Geelong, Australia; Applied AI Institute, Deakin University, Geelong, Australia", "aff_domain": "deakin.edu.au; ", "email": "deakin.edu.au; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/le22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Deakin University", "aff_unique_dep": "Applied AI Institute", "aff_unique_url": "https://www.deakin.edu.au", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Geelong", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "Neuron Dependency Graphs: A Causal Abstraction of Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17975", "id": "17975", "proceeding": "https://proceedings.mlr.press/v162/hu22b.html", "poster": "/media/PosterPDFs/ICML%202022/7cc234202e98d2722580858573fd0817.png?t=1658178631.839229", "slides": "", "author_site": "Yaojie Hu, Jin Tian", "author": "Yaojie Hu; Jin Tian", "abstract": "We discover that neural networks exhibit approximate logical dependencies among neurons, and we introduce Neuron Dependency Graphs (NDG) that extract 
and present them as directed graphs. In an NDG, each node corresponds to the boolean activation value of a neuron, and each edge models an approximate logical implication from one node to another. We show that the logical dependencies extracted from the training dataset generalize well to the test set. In addition to providing symbolic explanations to the neural network\u2019s internal structure, NDGs can represent a Structural Causal Model. We empirically show that an NDG is a causal abstraction of the corresponding neural network that \"unfolds\" the same way under causal interventions using the theory by Geiger et al. (2021). Code is available at https://github.com/phimachine/ndg.", "bibtex": "@InProceedings{pmlr-v162-hu22b,\n title = \t {Neuron Dependency Graphs: A Causal Abstraction of Neural Networks},\n author = {Hu, Yaojie and Tian, Jin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9020--9040},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hu22b/hu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/hu22b.html},\n abstract = \t {We discover that neural networks exhibit approximate logical dependencies among neurons, and we introduce Neuron Dependency Graphs (NDG) that extract and present them as directed graphs. In an NDG, each node corresponds to the boolean activation value of a neuron, and each edge models an approximate logical implication from one node to another. We show that the logical dependencies extracted from the training dataset generalize well to the test set. In addition to providing symbolic explanations to the neural network\u2019s internal structure, NDGs can represent a Structural Causal Model. We empirically show that an NDG is a causal abstraction of the corresponding neural network that \"unfolds\" the same way under causal interventions using the theory by Geiger et al. (2021). 
Code is available at https://github.com/phimachine/ndg.}\n}", "pdf": "https://proceedings.mlr.press/v162/hu22b/hu22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/hu22b-supp.zip", "pdf_size": 3748481, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16526739375767483765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, Iowa State University, United States; Department of Computer Science, Iowa State University, United States", "aff_domain": "iastate.edu; ", "email": "iastate.edu; ", "github": "https://github.com/phimachine/ndg", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/hu22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Neurotoxin: Durable Backdoors in Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18207", "id": "18207", "proceeding": "https://proceedings.mlr.press/v162/zhang22w.html", "poster": "/media/PosterPDFs/ICML%202022/fe7ee8fc1959cc7214fa21c4840dff0a.png?t=1658207841.2489343", "slides": "", "author_site": "Zhengming Zhang, Ashwinee Panda, Linyue Song, Yaoqing Yang, Michael Mahoney, Prateek Mittal, Kannan Ramchandran, Joseph E Gonzalez", "author": "Zhengming Zhang; Ashwinee Panda; Linyue Song; Yaoqing Yang; Michael Mahoney; Prateek Mittal; Ramchandran Kannan; Joseph Gonzalez", "abstract": "Federated learning (FL) systems have an inherent vulnerability to adversarial backdoor attacks during training due to their decentralized nature. The goal of the attacker is to implant backdoors in the learned model with poisoned updates such that at test time, the model\u2019s outputs can be fixed to a given target for certain inputs (e.g., if a user types \u201cpeople from New York\u201d into a mobile keyboard app that uses a backdoored next word prediction model, the model will autocomplete their sentence to \u201cpeople in New York are rude\u201d). Prior work has shown that backdoors can be inserted in FL, but these backdoors are not durable: they do not remain in the model after the attacker stops uploading poisoned updates because training continues, and in production FL systems an inserted backdoor may not survive until deployment. We propose Neurotoxin, a simple one-line backdoor attack that functions by attacking parameters that are changed less in magnitude during training. 
We conduct an exhaustive evaluation across ten natural language processing and computer vision tasks and find that we can double the durability of state of the art backdoors by adding a single line with Neurotoxin.", "bibtex": "@InProceedings{pmlr-v162-zhang22w,\n title = \t {Neurotoxin: Durable Backdoors in Federated Learning},\n author = {Zhang, Zhengming and Panda, Ashwinee and Song, Linyue and Yang, Yaoqing and Mahoney, Michael and Mittal, Prateek and Kannan, Ramchandran and Gonzalez, Joseph},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26429--26446},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22w/zhang22w.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22w.html},\n abstract = \t {Federated learning (FL) systems have an inherent vulnerability to adversarial backdoor attacks during training due to their decentralized nature. The goal of the attacker is to implant backdoors in the learned model with poisoned updates such that at test time, the model\u2019s outputs can be fixed to a given target for certain inputs (e.g., if a user types \u201cpeople from New York\u201d into a mobile keyboard app that uses a backdoored next word prediction model, the model will autocomplete their sentence to \u201cpeople in New York are rude\u201d). Prior work has shown that backdoors can be inserted in FL, but these backdoors are not durable: they do not remain in the model after the attacker stops uploading poisoned updates because training continues, and in production FL systems an inserted backdoor may not survive until deployment. We propose Neurotoxin, a simple one-line backdoor attack that functions by attacking parameters that are changed less in magnitude during training. 
We conduct an exhaustive evaluation across ten natural language processing and computer vision tasks and find that we can double the durability of state of the art backdoors by adding a single line with Neurotoxin.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22w/zhang22w.pdf", "supp": "", "pdf_size": 1077099, "gs_citation": 189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15130248935781363426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "School of Information Science and Engineering, Southeast University, China; Department of Electrical and Computer Engineering, Princeton University; Department of Electrical Engineering and Computer Sciences, University of California at Berkeley; Department of Electrical Engineering and Computer Sciences, University of California at Berkeley; International Computer Science Institute and Department of Statistics, University of California at Berkeley; Department of Electrical Engineering and Computer Sciences, University of California at Berkeley; Department of Electrical Engineering and Computer Sciences, University of California at Berkeley; Department of Electrical and Computer Engineering, Princeton University", "aff_domain": "princeton.edu; ; ; ; ; ; ; ", "email": "princeton.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/zhang22w.html", "aff_unique_index": "0;1;2;2;2;2;2;1", "aff_unique_norm": "Southeast University;Princeton University;University of California, Berkeley", "aff_unique_dep": "School of Information Science and Engineering;Department of Electrical and Computer Engineering;Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.seu.edu.cn/;https://www.princeton.edu;https://www.berkeley.edu", "aff_unique_abbr": ";Princeton;UC Berkeley", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;1;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "title": "No-Regret Learning in Partially-Informed Auctions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16797", "id": "16797", "proceeding": "https://proceedings.mlr.press/v162/guo22b.html", "poster": "/media/PosterPDFs/ICML%202022/cd17d3ce3b64f227987cd92cd701cc58.png?t=1657514764.279263", "slides": "", "author_site": "Wenshuo Guo, Michael Jordan, Ellen Vitercik", "author": "Wenshuo Guo; Michael Jordan; Ellen Vitercik", "abstract": "Auctions with partially-revealed information about items are broadly employed in real-world applications, but the underlying mechanisms have limited theoretical support. In this work, we study a machine learning formulation of these types of mechanisms, presenting algorithms that are no-regret from the buyer\u2019s perspective. Specifically, a buyer who wishes to maximize his utility interacts repeatedly with a platform over a series of $T$ rounds. In each round, a new item is drawn from an unknown distribution and the platform publishes a price together with incomplete, \u201cmasked\u201d information about the item. The buyer then decides whether to purchase the item. We formalize this problem as an online learning task where the goal is to have low regret with respect to a myopic oracle that has perfect knowledge of the distribution over items and the seller\u2019s masking function. 
When the distribution over items is known to the buyer and the mask is a SimHash function mapping $\\R^d$ to $\\{0,1\\}^{\\ell}$, our algorithm has regret $\\tilde \\cO((Td\\ell)^{\\nicefrac{1}{2}})$. In a fully agnostic setting when the mask is an arbitrary function mapping to a set of size $n$ and the prices are stochastic, our algorithm has regret $\\tilde \\cO((Tn)^{\\nicefrac{1}{2}})$.", "bibtex": "@InProceedings{pmlr-v162-guo22b,\n title = \t {No-Regret Learning in Partially-Informed Auctions},\n author = {Guo, Wenshuo and Jordan, Michael and Vitercik, Ellen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8039--8055},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22b/guo22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22b.html},\n abstract = \t {Auctions with partially-revealed information about items are broadly employed in real-world applications, but the underlying mechanisms have limited theoretical support. In this work, we study a machine learning formulation of these types of mechanisms, presenting algorithms that are no-regret from the buyer\u2019s perspective. Specifically, a buyer who wishes to maximize his utility interacts repeatedly with a platform over a series of $T$ rounds. In each round, a new item is drawn from an unknown distribution and the platform publishes a price together with incomplete, \u201cmasked\u201d information about the item. The buyer then decides whether to purchase the item. We formalize this problem as an online learning task where the goal is to have low regret with respect to a myopic oracle that has perfect knowledge of the distribution over items and the seller\u2019s masking function. When the distribution over items is known to the buyer and the mask is a SimHash function mapping $\\R^d$ to $\\{0,1\\}^{\\ell}$, our algorithm has regret $\\tilde \\cO((Td\\ell)^{\\nicefrac{1}{2}})$. 
In a fully agnostic setting when the mask is an arbitrary function mapping to a set of size $n$ and the prices are stochastic, our algorithm has regret $\\tilde \\cO((Tn)^{\\nicefrac{1}{2}})$.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22b/guo22b.pdf", "supp": "", "pdf_size": 386567, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4827729027319371372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering & Computer Sciences, University of California, Berkeley, USA+Department of Statistics, University of California, Berkeley, USA; Department of Electrical Engineering & Computer Sciences, University of California, Berkeley, USA+Department of Statistics, University of California, Berkeley, USA; Department of Management Science & Engineering and Department of Computer Science, Stanford University, USA", "aff_domain": "cs.berkeley.edu; ; ", "email": "cs.berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/guo22b.html", "aff_unique_index": "0+0;0+0;1", "aff_unique_norm": "University of California, Berkeley;Stanford University", "aff_unique_dep": "Department of Electrical Engineering & Computer Sciences;Department of Management Science & Engineering", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu", "aff_unique_abbr": "UC Berkeley;Stanford", "aff_campus_unique_index": "0+0;0+0;1", "aff_campus_unique": "Berkeley;Stanford", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "No-Regret Learning in Time-Varying Zero-Sum Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16315", "id": "16315", "proceeding": "https://proceedings.mlr.press/v162/zhang22an.html", "poster": "/media/PosterPDFs/ICML%202022/4928e7510f45da6575b04a28519c09ed.png?t=1657500686.448357", "slides": "", "author_site": "Mengxiao Zhang, Peng Zhao, Haipeng Luo, Zhi-Hua Zhou", "author": "Mengxiao Zhang; Peng Zhao; Haipeng Luo; Zhi-Hua Zhou", "abstract": "Learning from repeated play in a fixed two-player zero-sum game is a classic problem in game theory and online learning. We consider a variant of this problem where the game payoff matrix changes over time, possibly in an adversarial manner. We first present three performance measures to guide the algorithmic design for this problem: 1) the well-studied", "bibtex": "@InProceedings{pmlr-v162-zhang22an,\n title = \t {No-Regret Learning in Time-Varying Zero-Sum Games},\n author = {Zhang, Mengxiao and Zhao, Peng and Luo, Haipeng and Zhou, Zhi-Hua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26772--26808},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22an/zhang22an.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22an.html},\n abstract = \t {Learning from repeated play in a fixed two-player zero-sum game is a classic problem in game theory and online learning. We consider a variant of this problem where the game payoff matrix changes over time, possibly in an adversarial manner. 
We first present three performance measures to guide the algorithmic design for this problem: 1) the well-studied", "pdf": "https://proceedings.mlr.press/v162/zhang22an/zhang22an.pdf", "supp": "", "pdf_size": 539716, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9891299001083890434&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "University of Southern California; National Key Laboratory for Novel Software Technology, Nanjing University; University of Southern California; National Key Laboratory for Novel Software Technology, Nanjing University", "aff_domain": "usc.edu;lamda.nju.edu.cn; ; ", "email": "usc.edu;lamda.nju.edu.cn; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22an.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Southern California;Nanjing University", "aff_unique_dep": ";National Key Laboratory for Novel Software Technology", "aff_unique_url": "https://www.usc.edu;http://www.nju.edu.cn", "aff_unique_abbr": "USC;Nanjing University", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;China" }, { "title": "Non-Vacuous Generalisation Bounds for Shallow Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17947", "id": "17947", "proceeding": "https://proceedings.mlr.press/v162/biggs22a.html", "poster": "/media/PosterPDFs/ICML%202022/194cf6c2de8e00c05fcf16c498adc7bf.png?t=1657821788.0485828", "slides": "", "author_site": "Feix Biggs, Benjamin Guedj", "author": "Felix Biggs; Benjamin Guedj", "abstract": "We focus on a specific class of shallow neural networks with a single hidden layer, namely those with $L_2$-normalised data and either a sigmoid-shaped Gaussian error function (\u201cerf\u201d) activation or a Gaussian Error Linear Unit (GELU) activation. For these networks, we derive new generalisation bounds through the PAC-Bayesian theory; unlike most existing such bounds they apply to neural networks with deterministic rather than randomised parameters. Our bounds are empirically non-vacuous when the network is trained with vanilla stochastic gradient descent on MNIST and Fashion-MNIST.", "bibtex": "@InProceedings{pmlr-v162-biggs22a,\n title = \t {Non-Vacuous Generalisation Bounds for Shallow Neural Networks},\n author = {Biggs, Felix and Guedj, Benjamin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1963--1981},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/biggs22a/biggs22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/biggs22a.html},\n abstract = \t {We focus on a specific class of shallow neural networks with a single hidden layer, namely those with $L_2$-normalised data and either a sigmoid-shaped Gaussian error function (\u201cerf\u201d) activation or a Gaussian Error Linear Unit (GELU) activation. For these networks, we derive new generalisation bounds through the PAC-Bayesian theory; unlike most existing such bounds they apply to neural networks with deterministic rather than randomised parameters. 
Our bounds are empirically non-vacuous when the network is trained with vanilla stochastic gradient descent on MNIST and Fashion-MNIST.}\n}", "pdf": "https://proceedings.mlr.press/v162/biggs22a/biggs22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/biggs22a-supp.zip", "pdf_size": 562710, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11560382540049939968&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Centre for Artificial Intelligence and Department of Computer Science, University College London and Inria London, UK; Centre for Artificial Intelligence and Department of Computer Science, University College London and Inria London, UK", "aff_domain": "felixbiggs.com;ucl.ac.uk", "email": "felixbiggs.com;ucl.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/biggs22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University College London", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucl.ac.uk", "aff_unique_abbr": "UCL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Nonlinear Feature Diffusion on Hypergraphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16189", "id": "16189", "proceeding": "https://proceedings.mlr.press/v162/prokopchik22a.html", "poster": "/media/PosterPDFs/ICML%202022/90b8e8eca90756905bf80c293ae6a50a.png?t=1658081459.43455", "slides": "", "author_site": "Konstantin Prokopchik, Austin Benson, Francesco Tudisco", "author": "Konstantin Prokopchik; Austin R Benson; Francesco Tudisco", "abstract": "Hypergraphs are a common model for multiway relationships in data, and hypergraph semi-supervised learning is the problem of assigning labels to all nodes in a hypergraph, given labels on just a few nodes. Diffusions and label spreading are classical techniques for semi-supervised learning in the graph setting, and there are some standard ways to extend them to hypergraphs. However, these methods are linear models, and do not offer an obvious way of incorporating node features for making predictions. Here, we develop a nonlinear diffusion process on hypergraphs that spreads both features and labels following the hypergraph structure. Even though the process is nonlinear, we show global convergence to a unique limiting point for a broad class of nonlinearities and we show that such limit is the global minimum of a new regularized semi-supervised learning loss function which aims at reducing a generalized form of variance of the nodes across the hyperedges. The limiting point serves as a node embedding from which we make predictions with a linear model. 
Our approach is competitive with state-of-the-art graph and hypergraph neural networks, and also takes less time to train.", "bibtex": "@InProceedings{pmlr-v162-prokopchik22a,\n title = \t {Nonlinear Feature Diffusion on Hypergraphs},\n author = {Prokopchik, Konstantin and Benson, Austin R and Tudisco, Francesco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17945--17958},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/prokopchik22a/prokopchik22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/prokopchik22a.html},\n abstract = \t {Hypergraphs are a common model for multiway relationships in data, and hypergraph semi-supervised learning is the problem of assigning labels to all nodes in a hypergraph, given labels on just a few nodes. Diffusions and label spreading are classical techniques for semi-supervised learning in the graph setting, and there are some standard ways to extend them to hypergraphs. However, these methods are linear models, and do not offer an obvious way of incorporating node features for making predictions. Here, we develop a nonlinear diffusion process on hypergraphs that spreads both features and labels following the hypergraph structure. Even though the process is nonlinear, we show global convergence to a unique limiting point for a broad class of nonlinearities and we show that such limit is the global minimum of a new regularized semi-supervised learning loss function which aims at reducing a generalized form of variance of the nodes across the hyperedges. The limiting point serves as a node embedding from which we make predictions with a linear model. 
Our approach is competitive with state-of-the-art graph and hypergraph neural networks, and also takes less time to train.}\n}", "pdf": "https://proceedings.mlr.press/v162/prokopchik22a/prokopchik22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/prokopchik22a-supp.zip", "pdf_size": 418935, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12229680441657777933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Gran Sasso Science Institute, L\u2019Aquila, Italy+Cornell University, New York, USA; Cornell University, New York, USA; Gran Sasso Science Institute, L\u2019Aquila, Italy+Cornell University, New York, USA", "aff_domain": "gssi.it;cs.cornell.edu;gssi.it", "email": "gssi.it;cs.cornell.edu;gssi.it", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/prokopchik22a.html", "aff_unique_index": "0+1;1;0+1", "aff_unique_norm": "Gran Sasso Science Institute;Cornell University", "aff_unique_dep": ";", "aff_unique_url": "https://www.gssi.it;https://www.cornell.edu", "aff_unique_abbr": ";Cornell", "aff_campus_unique_index": "0+1;1;0+1", "aff_campus_unique": "L\u2019Aquila;Ithaca", "aff_country_unique_index": "0+1;1;0+1", "aff_country_unique": "Italy;United States" }, { "title": "Nonparametric Embeddings of Sparse High-Order Interaction Events", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16049", "id": "16049", "proceeding": "https://proceedings.mlr.press/v162/wang22ah.html", "poster": "/media/PosterPDFs/ICML%202022/bffc98347ee35b3ead06728d6f073c68_pji1ggV.png?t=1657477830.56123", "slides": "", "author_site": "Zheng Wang, Yiming Xu, Conor Tillinghast, Shibo Li, Akil Narayan, Shandian Zhe", "author": "Zheng Wang; Yiming Xu; Conor Tillinghast; Shibo Li; Akil Narayan; Shandian Zhe", "abstract": "High-order interaction events are common in real-world applications. Learning embeddings that encode the complex relationships of the participants from these events is of great importance in knowledge mining and predictive tasks. Despite the success of existing approaches, e.g. Poisson tensor factorization, they ignore the sparse structure underlying the data, namely the occurred interactions are far less than the possible interactions among all the participants. In this paper, we propose Nonparametric Embeddings of Sparse High-order interaction events (NESH). We hybridize a sparse hypergraph (tensor) process and a matrix Gaussian process to capture both the asymptotic structural sparsity within the interactions and nonlinear temporal relationships between the participants. We prove strong asymptotic bounds (including both a lower and an upper bound ) of the sparse ratio, which reveals the asymptotic properties of the sampled structure. We use batch-normalization, stick-breaking construction and sparse variational GP approximations to develop an efficient, scalable model inference algorithm. 
We demonstrate the advantage of our approach in several real-world applications.", "bibtex": "@InProceedings{pmlr-v162-wang22ah,\n title = \t {Nonparametric Embeddings of Sparse High-Order Interaction Events},\n author = {Wang, Zheng and Xu, Yiming and Tillinghast, Conor and Li, Shibo and Narayan, Akil and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23237--23253},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ah/wang22ah.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ah.html},\n abstract = \t {High-order interaction events are common in real-world applications. Learning embeddings that encode the complex relationships of the participants from these events is of great importance in knowledge mining and predictive tasks. Despite the success of existing approaches, e.g. Poisson tensor factorization, they ignore the sparse structure underlying the data, namely the occurred interactions are far less than the possible interactions among all the participants. In this paper, we propose Nonparametric Embeddings of Sparse High-order interaction events (NESH). We hybridize a sparse hypergraph (tensor) process and a matrix Gaussian process to capture both the asymptotic structural sparsity within the interactions and nonlinear temporal relationships between the participants. We prove strong asymptotic bounds (including both a lower and an upper bound ) of the sparse ratio, which reveals the asymptotic properties of the sampled structure. We use batch-normalization, stick-breaking construction and sparse variational GP approximations to develop an efficient, scalable model inference algorithm. 
We demonstrate the advantage of our approach in several real-world applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ah/wang22ah.pdf", "supp": "", "pdf_size": 618548, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14925526712452321478&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Computing, University of Utah; Department of Mathematics, University of Utah; Department of Mathematics, University of Utah; School of Computing, University of Utah; Department of Mathematics, University of Utah+Scientific Computing and Imaging (SCI) Institute, University of Utah; School of Computing, University of Utah", "aff_domain": "cs.utah.edu; ; ; ; ;cs.utah.edu", "email": "cs.utah.edu; ; ; ; ;cs.utah.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22ah.html", "aff_unique_index": "0;0;0;0;0+0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "U of U", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Utah;;Salt Lake City", "aff_country_unique_index": "0;0;0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Nonparametric Factor Trajectory Learning for Dynamic Tensor Decomposition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16045", "id": "16045", "proceeding": "https://proceedings.mlr.press/v162/wang22ar.html", "poster": "/media/PosterPDFs/ICML%202022/c9f95a0a5af052bffce5c89917335f67.png?t=1657477200.2675006", "slides": "", "author_site": "Zheng Wang, Shandian Zhe", "author": "Zheng Wang; Shandian Zhe", "abstract": "Tensor decomposition is a fundamental framework to analyze data that can be represented by multi-dimensional arrays. In practice, tensor data are often accompanied with temporal information, namely the time points when the entry values were generated. This information implies abundant, complex temporal variation patterns. However, current methods always assume the factor representations of the entities in each tensor mode are static, and never consider their temporal evolution. To fill this gap, we propose NONparametric FActor Trajectory learning for dynamic tensor decomposition (NONFAT). We place Gaussian process (GP) priors in the frequency domain and conduct inverse Fourier transform via Gauss-Laguerre quadrature to sample the trajectory functions. In this way, we can overcome data sparsity and obtain robust trajectory estimates across long time horizons. Given the trajectory values at specific time points, we use a second-level GP to sample the entry values and to capture the temporal relationship between the entities. For efficient and scalable inference, we leverage the matrix Gaussian structure in the model, introduce a matrix Gaussian posterior, and develop a nested sparse variational learning algorithm. 
We have shown the advantage of our method in several real-world applications.", "bibtex": "@InProceedings{pmlr-v162-wang22ar,\n title = \t {Nonparametric Factor Trajectory Learning for Dynamic Tensor Decomposition},\n author = {Wang, Zheng and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23459--23469},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ar/wang22ar.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ar.html},\n abstract = \t {Tensor decomposition is a fundamental framework to analyze data that can be represented by multi-dimensional arrays. In practice, tensor data are often accompanied with temporal information, namely the time points when the entry values were generated. This information implies abundant, complex temporal variation patterns. However, current methods always assume the factor representations of the entities in each tensor mode are static, and never consider their temporal evolution. To fill this gap, we propose NONparametric FActor Trajectory learning for dynamic tensor decomposition (NONFAT). We place Gaussian process (GP) priors in the frequency domain and conduct inverse Fourier transform via Gauss-Laguerre quadrature to sample the trajectory functions. In this way, we can overcome data sparsity and obtain robust trajectory estimates across long time horizons. Given the trajectory values at specific time points, we use a second-level GP to sample the entry values and to capture the temporal relationship between the entities. For efficient and scalable inference, we leverage the matrix Gaussian structure in the model, introduce a matrix Gaussian posterior, and develop a nested sparse variational learning algorithm. We have shown the advantage of our method in several real-world applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ar/wang22ar.pdf", "supp": "", "pdf_size": 1781912, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15933388034408305292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computing, University of Utah; School of Computing, University of Utah", "aff_domain": "cs.utah.edu;cs.utah.edu", "email": "cs.utah.edu;cs.utah.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22ar.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "U of U", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Utah", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Nonparametric Involutive Markov Chain Monte Carlo", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17861", "id": "17861", "proceeding": "https://proceedings.mlr.press/v162/mak22a.html", "poster": "", "slides": "", "author_site": "Carol Mak, Fabian Zaiser, Luke Ong", "author": "Carol Mak; Fabian Zaiser; Luke Ong", "abstract": "A challenging problem in probabilistic programming is to develop inference algorithms that work for arbitrary programs in a universal probabilistic programming language (PPL). 
We present the nonparametric involutive Markov chain Monte Carlo (NP-iMCMC) algorithm as a method for constructing MCMC inference algorithms for nonparametric models expressible in universal PPLs. Building on the unifying involutive MCMC framework, and by providing a general procedure for driving state movement between dimensions, we show that NP-iMCMC can generalise numerous existing iMCMC algorithms to work on nonparametric models. We prove the correctness of the NP-iMCMC sampler. Our empirical study shows that the existing strengths of several iMCMC algorithms carry over to their nonparametric extensions. Applying our method to the recently proposed Nonparametric HMC, an instance of (Multiple Step) NP-iMCMC, we have constructed several nonparametric extensions (all of which new) that exhibit significant performance improvements.", "bibtex": "@InProceedings{pmlr-v162-mak22a,\n title = \t {Nonparametric Involutive {M}arkov Chain {M}onte {C}arlo},\n author = {Mak, Carol and Zaiser, Fabian and Ong, Luke},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14802--14859},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mak22a/mak22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mak22a.html},\n abstract = \t {A challenging problem in probabilistic programming is to develop inference algorithms that work for arbitrary programs in a universal probabilistic programming language (PPL). We present the nonparametric involutive Markov chain Monte Carlo (NP-iMCMC) algorithm as a method for constructing MCMC inference algorithms for nonparametric models expressible in universal PPLs. Building on the unifying involutive MCMC framework, and by providing a general procedure for driving state movement between dimensions, we show that NP-iMCMC can generalise numerous existing iMCMC algorithms to work on nonparametric models. We prove the correctness of the NP-iMCMC sampler. Our empirical study shows that the existing strengths of several iMCMC algorithms carry over to their nonparametric extensions. 
Applying our method to the recently proposed Nonparametric HMC, an instance of (Multiple Step) NP-iMCMC, we have constructed several nonparametric extensions (all of which new) that exhibit significant performance improvements.}\n}", "pdf": "https://proceedings.mlr.press/v162/mak22a/mak22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/mak22a-supp.zip", "pdf_size": 1931978, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17862750245568901583&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Oxford, United Kingdom; Department of Computer Science, University of Oxford, United Kingdom; Department of Computer Science, University of Oxford, United Kingdom", "aff_domain": "cs.ox.ac.uk; ; ", "email": "cs.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mak22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Nonparametric Sparse Tensor Factorization with Hierarchical Gamma Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16053", "id": "16053", "proceeding": "https://proceedings.mlr.press/v162/tillinghast22a.html", "poster": "/media/PosterPDFs/ICML%202022/8fdd149fcaa7058caccc9c4ad5b0d89a.png?t=1657997317.7681968", "slides": "", "author_site": "Conor Tillinghast, Zheng Wang, Shandian Zhe", "author": "Conor Tillinghast; Zheng Wang; Shandian Zhe", "abstract": "We propose a nonparametric factorization approach for sparsely observed tensors. The sparsity does not mean zero-valued entries are massive or dominated. Rather, it implies the observed entries are very few, and even fewer with the growth of the tensor; this is ubiquitous in practice. Compared with the existent works, our model not only leverages the structural information underlying the observed entry indices, but also provides extra interpretability and flexibility {\u2014} it can simultaneously estimate a set of location factors about the intrinsic properties of the tensor nodes, and another set of sociability factors reflecting their extrovert activity in interacting with others; users are free to choose a trade-off between the two types of factors. Specifically, we use hierarchical Gamma processes and Poisson random measures to construct a tensor-valued process, which can freely sample the two types of factors to generate tensors and always guarantees an asymptotic sparsity. We then normalize the tensor process to obtain hierarchical Dirichlet processes to sample each observed entry index, and use a Gaussian process to sample the entry value as a nonlinear function of the factors, so as to capture both the sparse structure properties and complex node relationships. For efficient inference, we use Dirichlet process properties over finite sample partitions, density transformations, and random features to develop a stochastic variational estimation algorithm. 
We demonstrate the advantage of our method in several benchmark datasets.", "bibtex": "@InProceedings{pmlr-v162-tillinghast22a,\n title = \t {Nonparametric Sparse Tensor Factorization with Hierarchical Gamma Processes},\n author = {Tillinghast, Conor and Wang, Zheng and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21432--21448},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tillinghast22a/tillinghast22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tillinghast22a.html},\n abstract = \t {We propose a nonparametric factorization approach for sparsely observed tensors. The sparsity does not mean zero-valued entries are massive or dominated. Rather, it implies the observed entries are very few, and even fewer with the growth of the tensor; this is ubiquitous in practice. Compared with the existent works, our model not only leverages the structural information underlying the observed entry indices, but also provides extra interpretability and flexibility {\u2014} it can simultaneously estimate a set of location factors about the intrinsic properties of the tensor nodes, and another set of sociability factors reflecting their extrovert activity in interacting with others; users are free to choose a trade-off between the two types of factors. Specifically, we use hierarchical Gamma processes and Poisson random measures to construct a tensor-valued process, which can freely sample the two types of factors to generate tensors and always guarantees an asymptotic sparsity. We then normalize the tensor process to obtain hierarchical Dirichlet processes to sample each observed entry index, and use a Gaussian process to sample the entry value as a nonlinear function of the factors, so as to capture both the sparse structure properties and complex node relationships. For efficient inference, we use Dirichlet process properties over finite sample partitions, density transformations, and random features to develop a stochastic variational estimation algorithm. 
We demonstrate the advantage of our method in several benchmark datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/tillinghast22a/tillinghast22a.pdf", "supp": "", "pdf_size": 5830915, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4477329563094962291&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Mathematics, University of Utah; School of Computing, University of Utah; School of Computing, University of Utah", "aff_domain": "math.utah.edu;cs.utah.edu;cs.utah.edu", "email": "math.utah.edu;cs.utah.edu;cs.utah.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tillinghast22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Utah", "aff_unique_dep": "Department of Mathematics", "aff_unique_url": "https://www.utah.edu", "aff_unique_abbr": "Utah", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Utah", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Not All Poisons are Created Equal: Robust Training against Data Poisoning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17153", "id": "17153", "proceeding": "https://proceedings.mlr.press/v162/yang22j.html", "poster": "/media/PosterPDFs/ICML%202022/4aadd661908b181d059a117f02fbc9ec_Pv7Pan3.png?t=1657748287.3472733", "slides": "", "author_site": "Yu Yang, Tian Yu Liu, Baharan Mirzasoleiman", "author": "Yu Yang; Tian Yu Liu; Baharan Mirzasoleiman", "abstract": "Data poisoning causes misclassification of test time target examples, by injecting maliciously crafted samples in the training data. Existing defenses are often effective only against a specific type of targeted attack, significantly degrade the generalization performance, or are prohibitive for standard deep learning pipelines. In this work, we propose an efficient defense mechanism that significantly reduces the success rate of various data poisoning attacks, and provides theoretical guarantees for the performance of the model. Targeted attacks work by adding bounded perturbations to a randomly selected subset of training data to match the targets\u2019 gradient or representation. We show that: (i) under bounded perturbations, only a number of poisons can be optimized to have a gradient that is close enough to that of the target and make the attack successful; (ii) such effective poisons move away from their original class and get isolated in the gradient space; (iii) dropping examples in low-density gradient regions during training can successfully eliminate the effective poisons, and guarantees similar training dynamics to that of training on full data. 
Our extensive experiments show that our method significantly decreases the success rate of state-of-the-art targeted attacks, including Gradient Matching and Bullseye Polytope, and easily scales to large datasets.", "bibtex": "@InProceedings{pmlr-v162-yang22j,\n title = \t {Not All Poisons are Created Equal: Robust Training against Data Poisoning},\n author = {Yang, Yu and Liu, Tian Yu and Mirzasoleiman, Baharan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25154--25165},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22j/yang22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22j.html},\n abstract = \t {Data poisoning causes misclassification of test time target examples, by injecting maliciously crafted samples in the training data. Existing defenses are often effective only against a specific type of targeted attack, significantly degrade the generalization performance, or are prohibitive for standard deep learning pipelines. In this work, we propose an efficient defense mechanism that significantly reduces the success rate of various data poisoning attacks, and provides theoretical guarantees for the performance of the model. Targeted attacks work by adding bounded perturbations to a randomly selected subset of training data to match the targets\u2019 gradient or representation. We show that: (i) under bounded perturbations, only a number of poisons can be optimized to have a gradient that is close enough to that of the target and make the attack successful; (ii) such effective poisons move away from their original class and get isolated in the gradient space; (iii) dropping examples in low-density gradient regions during training can successfully eliminate the effective poisons, and guarantees similar training dynamics to that of training on full data. 
Our extensive experiments show that our method significantly decreases the success rate of state-of-the-art targeted attacks, including Gradient Matching and Bullseye Polytope, and easily scales to large datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22j/yang22j.pdf", "supp": "", "pdf_size": 912291, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17906905243022517677&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of California, Los Angeles, United States; Department of Computer Science, University of California, Los Angeles, United States; Department of Computer Science, University of California, Los Angeles, United States", "aff_domain": "cs.ucla.edu; ; ", "email": "cs.ucla.edu; ; ", "github": "https://github.com/YuYang0901/Epic", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yang22j.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "NysADMM: faster composite convex optimization via low-rank approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16563", "id": "16563", "proceeding": "https://proceedings.mlr.press/v162/zhao22a.html", "poster": "/media/PosterPDFs/ICML%202022/dc16622ddc767e6bc1200fe5df2fbdfb.png?t=1657908906.0173078", "slides": "", "author_site": "Shipu Zhao, Zachary Frangella, Madeleine Udell", "author": "Shipu Zhao; Zachary Frangella; Madeleine Udell", "abstract": "This paper develops a scalable new algorithm, called NysADMM, to minimize a smooth convex loss function with a convex regularizer. NysADMM accelerates the inexact Alternating Direction Method of Multipliers (ADMM) by constructing a preconditioner for the ADMM subproblem from a randomized low-rank Nystr\u00f6m approximation. NysADMM comes with strong theoretical guarantees: it solves the ADMM subproblem in a constant number of iterations when the rank of the Nystr\u00f6m approximation is the effective dimension of the subproblem regularized Gram matrix. In practice, ranks much smaller than the effective dimension can succeed, so NysADMM uses an adaptive strategy to choose the rank that enjoys analogous guarantees. Numerical experiments on real-world datasets demonstrate that NysADMM can solve important applications, such as the lasso, logistic regression, and support vector machines, in half the time (or less) required by standard solvers. 
The breadth of problems on which NysADMM beats standard solvers is a surprise: it suggests that ADMM is a dominant paradigm for numerical optimization across a wide range of statistical learning problems that are usually solved with bespoke methods.", "bibtex": "@InProceedings{pmlr-v162-zhao22a,\n title = \t {{N}ys{ADMM}: faster composite convex optimization via low-rank approximation},\n author = {Zhao, Shipu and Frangella, Zachary and Udell, Madeleine},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26824--26840},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22a/zhao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22a.html},\n abstract = \t {This paper develops a scalable new algorithm, called NysADMM, to minimize a smooth convex loss function with a convex regularizer. NysADMM accelerates the inexact Alternating Direction Method of Multipliers (ADMM) by constructing a preconditioner for the ADMM subproblem from a randomized low-rank Nystr\u00f6m approximation. NysADMM comes with strong theoretical guarantees: it solves the ADMM subproblem in a constant number of iterations when the rank of the Nystr\u00f6m approximation is the effective dimension of the subproblem regularized Gram matrix. In practice, ranks much smaller than the effective dimension can succeed, so NysADMM uses an adaptive strategy to choose the rank that enjoys analogous guarantees. Numerical experiments on real-world datasets demonstrate that NysADMM can solve important applications, such as the lasso, logistic regression, and support vector machines, in half the time (or less) required by standard solvers. 
The breadth of problems on which NysADMM beats standard solvers is a surprise: it suggests that ADMM is a dominant paradigm for numerical optimization across a wide range of statistical learning problems that are usually solved with bespoke methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22a/zhao22a.pdf", "supp": "", "pdf_size": 464793, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4485606110048195518&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Cornell University, Ithaca, NY, USA; Stanford University, Stanford, CA, USA; Stanford University, Stanford, CA, USA", "aff_domain": "cornell.edu; ; ", "email": "cornell.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Cornell University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.stanford.edu", "aff_unique_abbr": "Cornell;Stanford", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Ithaca;Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Nystr\u00f6m Kernel Mean Embeddings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17771", "id": "17771", "proceeding": "https://proceedings.mlr.press/v162/chatalic22a.html", "poster": "/media/PosterPDFs/ICML%202022/6bb56208f672af0dd65451f869fedfd9.png?t=1657179435.751628", "slides": "", "author_site": "Antoine Chatalic, Nicolas Schreuder, Lorenzo Rosasco, Alessandro Rudi", "author": "Antoine Chatalic; Nicolas Schreuder; Lorenzo Rosasco; Alessandro Rudi", "abstract": "Kernel mean embeddings are a powerful tool to represent probability distributions over arbitrary spaces as single points in a Hilbert space. Yet, the cost of computing and storing such embeddings prohibits their direct use in large-scale settings. We propose an efficient approximation procedure based on the Nystr{\u00f6}m method, which exploits a small random subset of the dataset. Our main result is an upper bound on the approximation error of this procedure. It yields sufficient conditions on the subsample size to obtain the standard (1/sqrt(n)) rate while reducing computational costs. We discuss applications of this result for the approximation of the maximum mean discrepancy and quadrature rules, and we illustrate our theoretical findings with numerical experiments.", "bibtex": "@InProceedings{pmlr-v162-chatalic22a,\n title = \t {{N}ystr{\u00f6}m Kernel Mean Embeddings},\n author = {Chatalic, Antoine and Schreuder, Nicolas and Rosasco, Lorenzo and Rudi, Alessandro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3006--3024},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chatalic22a/chatalic22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chatalic22a.html},\n abstract = \t {Kernel mean embeddings are a powerful tool to represent probability distributions over arbitrary spaces as single points in a Hilbert space. Yet, the cost of computing and storing such embeddings prohibits their direct use in large-scale settings. 
We propose an efficient approximation procedure based on the Nystr{\u00f6}m method, which exploits a small random subset of the dataset. Our main result is an upper bound on the approximation error of this procedure. It yields sufficient conditions on the subsample size to obtain the standard (1/sqrt(n)) rate while reducing computational costs. We discuss applications of this result for the approximation of the maximum mean discrepancy and quadrature rules, and we illustrate our theoretical findings with numerical experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/chatalic22a/chatalic22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chatalic22a-supp.zip", "pdf_size": 438783, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9760794437931960587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "MaLGA & DIBRIS, Universit `a di Genova; MaLGA & DIBRIS, Universit `a di Genova; Inria, \u00b4Ecole normale sup \u00b4erieure, PSL Research University; MaLGA & DIBRIS, Universit `a di Genova+CBMM, MIT, IIT", "aff_domain": "dibris.unige.it; ; ; ", "email": "dibris.unige.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chatalic22a.html", "aff_unique_index": "0;0;1;0+2", "aff_unique_norm": "Universit\u00e0 di Genova;INRIA;Massachusetts Institute of Technology", "aff_unique_dep": "MaLGA, DIBRIS;;CBMM", "aff_unique_url": "https://www.unige.it;https://www.inria.fr;https://www.mit.edu", "aff_unique_abbr": ";Inria;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;1;0+2", "aff_country_unique": "Italy;France;United States" }, { "title": "OFA: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17835", "id": "17835", "proceeding": "https://proceedings.mlr.press/v162/wang22al.html", "poster": "/media/PosterPDFs/ICML%202022/6917ff2a7b53421ff4066020e2d89eec_ODAb1me.png?t=1656402976.9148357", "slides": "/media/icml-2022/Slides/17835_b3yKPXu.pdf", "author_site": "Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, Hongxia Yang", "author": "Peng Wang; An Yang; Rui Men; Junyang Lin; Shuai Bai; Zhikang Li; Jianxin Ma; Chang Zhou; Jingren Zhou; Hongxia Yang", "abstract": "In this work, we pursue a unified paradigm for multimodal pretraining to break the shackles of complex task/modality-specific customization. We propose OFA, a Task-Agnostic and Modality-Agnostic framework that supports Task Comprehensiveness. OFA unifies a diverse set of cross-modal and unimodal tasks, including image generation, visual grounding, image captioning, image classification, language modeling, etc., in a simple sequence-to-sequence learning framework. OFA follows the instruction-based learning in both pretraining and finetuning stages, requiring no extra task-specific layers for downstream tasks. In comparison with the recent state-of-the-art vision & language models that rely on extremely large cross-modal datasets, OFA is pretrained on only 20M publicly available image-text pairs. Despite its simplicity and relatively small-scale training data, OFA achieves new SOTAs in a series of cross-modal tasks while attaining highly competitive performances on uni-modal tasks. Our further analysis indicates that OFA can also effectively transfer to unseen tasks and unseen domains. 
Our code and models are publicly available at https://github.com/OFA-Sys/OFA.", "bibtex": "@InProceedings{pmlr-v162-wang22al,\n title = \t {{OFA}: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework},\n author = {Wang, Peng and Yang, An and Men, Rui and Lin, Junyang and Bai, Shuai and Li, Zhikang and Ma, Jianxin and Zhou, Chang and Zhou, Jingren and Yang, Hongxia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23318--23340},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22al/wang22al.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22al.html},\n abstract = \t {In this work, we pursue a unified paradigm for multimodal pretraining to break the shackles of complex task/modality-specific customization. We propose OFA, a Task-Agnostic and Modality-Agnostic framework that supports Task Comprehensiveness. OFA unifies a diverse set of cross-modal and unimodal tasks, including image generation, visual grounding, image captioning, image classification, language modeling, etc., in a simple sequence-to-sequence learning framework. OFA follows the instruction-based learning in both pretraining and finetuning stages, requiring no extra task-specific layers for downstream tasks. In comparison with the recent state-of-the-art vision & language models that rely on extremely large cross-modal datasets, OFA is pretrained on only 20M publicly available image-text pairs. Despite its simplicity and relatively small-scale training data, OFA achieves new SOTAs in a series of cross-modal tasks while attaining highly competitive performances on uni-modal tasks. Our further analysis indicates that OFA can also effectively transfer to unseen tasks and unseen domains. 
Our code and models are publicly available at https://github.com/OFA-Sys/OFA.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22al/wang22al.pdf", "supp": "", "pdf_size": 47633292, "gs_citation": 1222, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6454935217962836227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China; DAMO Academy, Alibaba Group, China", "aff_domain": "alibaba-inc.com; ; ; ; ; ; ; ; ; ", "email": "alibaba-inc.com; ; ; ; ; ; ; ; ; ", "github": "https://github.com/OFA-Sys/OFA", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/wang22al.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "DAMO Academy", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Object Permanence Emerges in a Random Walk along Memory", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16737", "id": "16737", "proceeding": "https://proceedings.mlr.press/v162/tokmakov22a.html", "poster": "/media/PosterPDFs/ICML%202022/2723d092b63885e0d7c260cc007e8b9d.png?t=1655919997.0708804", "slides": "/media/icml-2022/Slides/16737.pdf", "author_site": "Pavel Tokmakov, Allan Jabri, Jie Li, Adrien Gaidon", "author": "Pavel Tokmakov; Allan Jabri; Jie Li; Adrien Gaidon", "abstract": "This paper proposes a self-supervised objective for learning representations that localize objects under occlusion - a property known as object permanence. A central question is the choice of learning signal in cases of total occlusion. Rather than directly supervising the locations of invisible objects, we propose a self-supervised objective that requires neither human annotation, nor assumptions about object dynamics. We show that object permanence can emerge by optimizing for temporal coherence of memory: we fit a Markov walk along a space-time graph of memories, where the states in each time step are non-Markovian features from a sequence encoder. This leads to a memory representation that stores occluded objects and predicts their motion, to better localize them. 
The resulting model outperforms existing approaches on several datasets of increasing complexity and realism, despite requiring minimal supervision, and hence being broadly applicable.", "bibtex": "@InProceedings{pmlr-v162-tokmakov22a,\n title = \t {Object Permanence Emerges in a Random Walk along Memory},\n author = {Tokmakov, Pavel and Jabri, Allan and Li, Jie and Gaidon, Adrien},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21506--21519},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tokmakov22a/tokmakov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tokmakov22a.html},\n abstract = \t {This paper proposes a self-supervised objective for learning representations that localize objects under occlusion - a property known as object permanence. A central question is the choice of learning signal in cases of total occlusion. Rather than directly supervising the locations of invisible objects, we propose a self-supervised objective that requires neither human annotation, nor assumptions about object dynamics. We show that object permanence can emerge by optimizing for temporal coherence of memory: we fit a Markov walk along a space-time graph of memories, where the states in each time step are non-Markovian features from a sequence encoder. This leads to a memory representation that stores occluded objects and predicts their motion, to better localize them. The resulting model outperforms existing approaches on several datasets of increasing complexity and realism, despite requiring minimal supervision, and hence being broadly applicable.}\n}", "pdf": "https://proceedings.mlr.press/v162/tokmakov22a/tokmakov22a.pdf", "supp": "", "pdf_size": 16687517, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10539524770377874867&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Toyota Research Institute; UC Berkeley; Toyota Research Institute; Toyota Research Institute", "aff_domain": "tri.global; ; ; ", "email": "tri.global; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/tokmakov22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Toyota Research Institute;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.tri.global;https://www.berkeley.edu", "aff_unique_abbr": "TRI;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Off-Policy Evaluation for Large Action Spaces via Embeddings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16729", "id": "16729", "proceeding": "https://proceedings.mlr.press/v162/saito22a.html", "poster": "/media/PosterPDFs/ICML%202022/6da9003b743b65f4c0ccd295cc484e57.png?t=1657179706.4877508", "slides": "", "author_site": "Yuta Saito, Thorsten Joachims", "author": "Yuta Saito; Thorsten Joachims", "abstract": "Off-policy evaluation (OPE) in contextual bandits has seen rapid adoption in real-world systems, since it enables offline evaluation of new policies using only historic log data. 
Unfortunately, when the number of actions is large, existing OPE estimators \u2013 most of which are based on inverse propensity score weighting \u2013 degrade severely and can suffer from extreme bias and variance. This foils the use of OPE in many applications from recommender systems to language models. To overcome this issue, we propose a new OPE estimator that leverages marginalized importance weights when action embeddings provide structure in the action space. We characterize the bias, variance, and mean squared error of the proposed estimator and analyze the conditions under which the action embedding provides statistical benefits over conventional estimators. In addition to the theoretical analysis, we find that the empirical performance improvement can be substantial, enabling reliable OPE even when existing estimators collapse due to a large number of actions.", "bibtex": "@InProceedings{pmlr-v162-saito22a,\n title = \t {Off-Policy Evaluation for Large Action Spaces via Embeddings},\n author = {Saito, Yuta and Joachims, Thorsten},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19089--19122},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saito22a/saito22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/saito22a.html},\n abstract = \t {Off-policy evaluation (OPE) in contextual bandits has seen rapid adoption in real-world systems, since it enables offline evaluation of new policies using only historic log data. Unfortunately, when the number of actions is large, existing OPE estimators \u2013 most of which are based on inverse propensity score weighting \u2013 degrade severely and can suffer from extreme bias and variance. This foils the use of OPE in many applications from recommender systems to language models. To overcome this issue, we propose a new OPE estimator that leverages marginalized importance weights when action embeddings provide structure in the action space. We characterize the bias, variance, and mean squared error of the proposed estimator and analyze the conditions under which the action embedding provides statistical benefits over conventional estimators. 
In addition to the theoretical analysis, we find that the empirical performance improvement can be substantial, enabling reliable OPE even when existing estimators collapse due to a large number of actions.}\n}", "pdf": "https://proceedings.mlr.press/v162/saito22a/saito22a.pdf", "supp": "", "pdf_size": 9477560, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6352283989670332766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Cornell University; Department of Computer Science, Cornell University", "aff_domain": "cornell.edu;cs.cornell.edu", "email": "cornell.edu;cs.cornell.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/saito22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Off-Policy Fitted Q-Evaluation with Differentiable Function Approximators: Z-Estimation and Inference Theory", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17759", "id": "17759", "proceeding": "https://proceedings.mlr.press/v162/zhang22al.html", "poster": "", "slides": "", "author_site": "Ruiqi Zhang, Xuezhou Zhang, Chengzhuo Ni, Mengdi Wang", "author": "Ruiqi Zhang; Xuezhou Zhang; Chengzhuo Ni; Mengdi Wang", "abstract": "Off-Policy Evaluation (OPE) serves as one of the cornerstones in Reinforcement Learning (RL). Fitted Q Evaluation (FQE) with various function approximators, especially deep neural networks, has gained practical success. While statistical analysis has proved FQE to be minimax-optimal with tabular, linear and several nonparametric function families, its practical performance with more general function approximator is less theoretically understood. We focus on FQE with general", "bibtex": "@InProceedings{pmlr-v162-zhang22al,\n title = \t {Off-Policy Fitted Q-Evaluation with Differentiable Function Approximators: Z-Estimation and Inference Theory},\n author = {Zhang, Ruiqi and Zhang, Xuezhou and Ni, Chengzhuo and Wang, Mengdi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26713--26749},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22al/zhang22al.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22al.html},\n abstract = \t {Off-Policy Evaluation (OPE) serves as one of the cornerstones in Reinforcement Learning (RL). Fitted Q Evaluation (FQE) with various function approximators, especially deep neural networks, has gained practical success. While statistical analysis has proved FQE to be minimax-optimal with tabular, linear and several nonparametric function families, its practical performance with more general function approximator is less theoretically understood. 
We focus on FQE with general", "pdf": "https://proceedings.mlr.press/v162/zhang22al/zhang22al.pdf", "supp": "", "pdf_size": 567580, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5074142210258838460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Mathematical Sciences, Peking University; Department of Electrical Computer Engineering, Princeton University; Department of Electrical Computer Engineering, Princeton University; Department of Electrical Computer Engineering, Princeton University + Deepmind", "aff_domain": "pku.edu.cn;princeton.edu;princeton.edu;princeton.edu", "email": "pku.edu.cn;princeton.edu;princeton.edu;princeton.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22al.html", "aff_unique_index": "0;1;1;1+2", "aff_unique_norm": "Peking University;Princeton University;DeepMind", "aff_unique_dep": "School of Mathematical Sciences;Department of Electrical Computer Engineering;", "aff_unique_url": "http://www.pku.edu.cn;https://www.princeton.edu;https://deepmind.com", "aff_unique_abbr": "PKU;Princeton;DeepMind", "aff_campus_unique_index": "0;", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;1;1;1+2", "aff_country_unique": "China;United States;United Kingdom" }, { "title": "Off-Policy Reinforcement Learning with Delayed Rewards", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16161", "id": "16161", "proceeding": "https://proceedings.mlr.press/v162/han22e.html", "poster": "/media/PosterPDFs/ICML%202022/cd81cfd0a3397761fac44ddbe5ec3349.png?t=1657770353.3497524", "slides": "", "author_site": "Beining Han, Zhizhou Ren, Zuofan Wu, Yuan Zhou, Jian Peng", "author": "Beining Han; Zhizhou Ren; Zuofan Wu; Yuan Zhou; Jian Peng", "abstract": "We study deep reinforcement learning (RL) algorithms with delayed rewards. In many real-world tasks, instant rewards are often not readily accessible or even defined immediately after the agent performs actions. In this work, we first formally define the environment with delayed rewards and discuss the challenges raised due to the non-Markovian nature of such environments. Then, we introduce a general off-policy RL framework with a new Q-function formulation that can handle the delayed rewards with theoretical convergence guarantees. For practical tasks with high dimensional state spaces, we further introduce the HC-decomposition rule of the Q-function in our framework which naturally leads to an approximation scheme that helps boost the training efficiency and stability. 
We finally conduct extensive experiments to demonstrate the superior performance of our algorithms over the existing work and their variants.", "bibtex": "@InProceedings{pmlr-v162-han22e,\n title = \t {Off-Policy Reinforcement Learning with Delayed Rewards},\n author = {Han, Beining and Ren, Zhizhou and Wu, Zuofan and Zhou, Yuan and Peng, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8280--8303},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22e/han22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22e.html},\n abstract = \t {We study deep reinforcement learning (RL) algorithms with delayed rewards. In many real-world tasks, instant rewards are often not readily accessible or even defined immediately after the agent performs actions. In this work, we first formally define the environment with delayed rewards and discuss the challenges raised due to the non-Markovian nature of such environments. Then, we introduce a general off-policy RL framework with a new Q-function formulation that can handle the delayed rewards with theoretical convergence guarantees. For practical tasks with high dimensional state spaces, we further introduce the HC-decomposition rule of the Q-function in our framework which naturally leads to an approximation scheme that helps boost the training efficiency and stability. We finally conduct extensive experiments to demonstrate the superior performance of our algorithms over the existing work and their variants.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22e/han22e.pdf", "supp": "", "pdf_size": 3132556, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10466467384922349914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China+Helixon Limited, Beijing, China+Yau Mathematical Sciences Center, Tsinghua University, Beijing, China+Institute for Industry AI Research, Tsinghua University, Beijing, China; Department of Computer Science, University of Illinois Urbana-Champaign, Illinois, United States+Helixon Limited, Beijing, China+Yau Mathematical Sciences Center, Tsinghua University, Beijing, China+Institute for Industry AI Research, Tsinghua University, Beijing, China; Helixon Limited, Beijing, China+Yau Mathematical Sciences Center, Tsinghua University, Beijing, China+Institute for Industry AI Research, Tsinghua University, Beijing, China; Yau Mathematical Sciences Center, Tsinghua University, Beijing, China; Institute for Industry AI Research, Tsinghua University, Beijing, China", "aff_domain": "illinois.edu; ; ; ;illinois.edu", "email": "illinois.edu; ; ; ;illinois.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/han22e.html", "aff_unique_index": "0+1+0+0;2+1+0+0;1+0+0;0;0", "aff_unique_norm": "Tsinghua University;HeliXon Limited;University of Illinois Urbana-Champaign", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;Department of Computer Science", "aff_unique_url": "https://www.tsinghua.edu.cn;;https://illinois.edu", "aff_unique_abbr": "Tsinghua;;UIUC", "aff_campus_unique_index": "0+0+0;2+0+0;0+0;0;0", 
"aff_campus_unique": "Beijing;;Urbana-Champaign", "aff_country_unique_index": "0+0+0+0;1+0+0+0;0+0+0;0;0", "aff_country_unique": "China;United States" }, { "title": "Offline Meta-Reinforcement Learning with Online Self-Supervision", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16151", "id": "16151", "proceeding": "https://proceedings.mlr.press/v162/pong22a.html", "poster": "/media/PosterPDFs/ICML%202022/767c23430487b6c64d45b83d5d32e9a1.png?t=1657518385.0952175", "slides": "", "author_site": "Vitchyr Pong, Ashvin Nair, Laura Smith, Catherine Huang, Sergey Levine", "author": "Vitchyr H Pong; Ashvin V Nair; Laura M Smith; Catherine Huang; Sergey Levine", "abstract": "Meta-reinforcement learning (RL) methods can meta-train policies that adapt to new tasks with orders of magnitude less data than standard RL, but meta-training itself is costly and time-consuming. If we can meta-train on offline data, then we can reuse the same static dataset, labeled once with rewards for different tasks, to meta-train policies that adapt to a variety of new tasks at meta-test time. Although this capability would make meta-RL a practical tool for real-world use, offline meta-RL presents additional challenges beyond online meta-RL or standard offline RL settings. Meta-RL learns an exploration strategy that collects data for adapting, and also meta-trains a policy that quickly adapts to data from a new task. Since this policy was meta-trained on a fixed, offline dataset, it might behave unpredictably when adapting to data collected by the learned exploration strategy, which differs systematically from the offline data and thus induces distributional shift. We propose a hybrid offline meta-RL algorithm, which uses offline data with rewards to meta-train an adaptive policy, and then collects additional unsupervised online data, without any reward labels to bridge this distribution shift. By not requiring reward labels for online collection, this data can be much cheaper to collect. We compare our method to prior work on offline meta-RL on simulated robot locomotion and manipulation tasks and find that using additional unsupervised online data collection leads to a dramatic improvement in the adaptive capabilities of the meta-trained policies, matching the performance of fully online meta-RL on a range of challenging domains that require generalization to new tasks.", "bibtex": "@InProceedings{pmlr-v162-pong22a,\n title = \t {Offline Meta-Reinforcement Learning with Online Self-Supervision},\n author = {Pong, Vitchyr H and Nair, Ashvin V and Smith, Laura M and Huang, Catherine and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17811--17829},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pong22a/pong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pong22a.html},\n abstract = \t {Meta-reinforcement learning (RL) methods can meta-train policies that adapt to new tasks with orders of magnitude less data than standard RL, but meta-training itself is costly and time-consuming. 
If we can meta-train on offline data, then we can reuse the same static dataset, labeled once with rewards for different tasks, to meta-train policies that adapt to a variety of new tasks at meta-test time. Although this capability would make meta-RL a practical tool for real-world use, offline meta-RL presents additional challenges beyond online meta-RL or standard offline RL settings. Meta-RL learns an exploration strategy that collects data for adapting, and also meta-trains a policy that quickly adapts to data from a new task. Since this policy was meta-trained on a fixed, offline dataset, it might behave unpredictably when adapting to data collected by the learned exploration strategy, which differs systematically from the offline data and thus induces distributional shift. We propose a hybrid offline meta-RL algorithm, which uses offline data with rewards to meta-train an adaptive policy, and then collects additional unsupervised online data, without any reward labels to bridge this distribution shift. By not requiring reward labels for online collection, this data can be much cheaper to collect. We compare our method to prior work on offline meta-RL on simulated robot locomotion and manipulation tasks and find that using additional unsupervised online data collection leads to a dramatic improvement in the adaptive capabilities of the meta-trained policies, matching the performance of fully online meta-RL on a range of challenging domains that require generalization to new tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/pong22a/pong22a.pdf", "supp": "", "pdf_size": 3394611, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15951748320809875665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley", "aff_domain": "eecs.berkeley.edu; ; ; ; ", "email": "eecs.berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/pong22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Offline RL Policies Should Be Trained to be Adaptive", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18003", "id": "18003", "proceeding": "https://proceedings.mlr.press/v162/ghosh22a.html", "poster": "/media/PosterPDFs/ICML%202022/d7619beb6eb189509885fbc192d2874b.png?t=1658157046.9368575", "slides": "", "author_site": "Dibya Ghosh, Anurag Ajay, Pulkit Agrawal, Sergey Levine", "author": "Dibya Ghosh; Anurag Ajay; Pulkit Agrawal; Sergey Levine", "abstract": "Offline RL algorithms must account for the fact that the dataset they are provided may leave many facets of the environment unknown. The most common way to approach this challenge is to employ pessimistic or conservative methods, which avoid behaviors that are too dissimilar from those in the training dataset. However, relying exclusively on conservatism has drawbacks: performance is sensitive to the exact degree of conservatism, and conservative objectives can recover highly suboptimal policies. 
In this work, we propose that offline RL methods should instead be adaptive in the presence of uncertainty. We show that acting optimally in offline RL in a Bayesian sense involves solving an implicit POMDP. As a result, optimal policies for offline RL must be adaptive, depending not just on the current state but rather all the transitions seen so far during evaluation. We present a model-free algorithm for approximating this optimal adaptive policy, and demonstrate the efficacy of learning such adaptive policies in offline RL benchmarks.", "bibtex": "@InProceedings{pmlr-v162-ghosh22a,\n title = \t {Offline {RL} Policies Should Be Trained to be Adaptive},\n author = {Ghosh, Dibya and Ajay, Anurag and Agrawal, Pulkit and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7513--7530},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghosh22a/ghosh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghosh22a.html},\n abstract = \t {Offline RL algorithms must account for the fact that the dataset they are provided may leave many facets of the environment unknown. The most common way to approach this challenge is to employ pessimistic or conservative methods, which avoid behaviors that are too dissimilar from those in the training dataset. However, relying exclusively on conservatism has drawbacks: performance is sensitive to the exact degree of conservatism, and conservative objectives can recover highly suboptimal policies. In this work, we propose that offline RL methods should instead be adaptive in the presence of uncertainty. We show that acting optimally in offline RL in a Bayesian sense involves solving an implicit POMDP. As a result, optimal policies for offline RL must be adaptive, depending not just on the current state but rather all the transitions seen so far during evaluation. 
We present a model-free algorithm for approximating this optimal adaptive policy, and demonstrate the efficacy of learning such adaptive policies in offline RL benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/ghosh22a/ghosh22a.pdf", "supp": "", "pdf_size": 2529265, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5962279150353060035&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "UC Berkeley; MIT; MIT; UC Berkeley", "aff_domain": "berkeley.edu; ; ;berkeley.edu", "email": "berkeley.edu; ; ;berkeley.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ghosh22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Omni-Granular Ego-Semantic Propagation for Self-Supervised Graph Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16119", "id": "16119", "proceeding": "https://proceedings.mlr.press/v162/yang22d.html", "poster": "/media/PosterPDFs/ICML%202022/aab3238922bcc25a6f606eb525ffdc56.png?t=1657159217.195775", "slides": "/media/icml-2022/Slides/16119.pdf", "author_site": "Ling Yang, Shenda Hong", "author": "Ling Yang; Shenda Hong", "abstract": "Unsupervised/self-supervised graph representation learning is critical for downstream node- and graph-level classification tasks. Global structure of graphs helps discriminating representations and existing methods mainly utilize the global structure by imposing additional supervisions. However, their global semantics are usually invariant for all nodes/graphs and they fail to explicitly embed the global semantics to enrich the representations. In this paper, we propose Omni-Granular Ego-Semantic Propagation for Self-Supervised Graph Representation Learning (OEPG). Specifically, we introduce instance-adaptive global-aware ego-semantic descriptors, leveraging the first- and second-order feature differences between each node/graph and hierarchical global clusters of the entire graph dataset. The descriptors can be explicitly integrated into local graph convolution as new neighbor nodes. Besides, we design an omni-granular normalization on the whole scales and hierarchies of the ego-semantic to assign attentional weight to each descriptor from an omni-granular perspective. Specialized pretext tasks and cross-iteration momentum update are further developed for local-global mutual adaptation. In downstream tasks, OEPG consistently achieves the best performance with a 2%~6% accuracy gain on multiple datasets cross scales and domains. 
Notably, OEPG also generalizes to quantity- and topology-imbalance scenarios.", "bibtex": "@InProceedings{pmlr-v162-yang22d,\n title = \t {Omni-Granular Ego-Semantic Propagation for Self-Supervised Graph Representation Learning},\n author = {Yang, Ling and Hong, Shenda},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25022--25037},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22d/yang22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22d.html},\n abstract = \t {Unsupervised/self-supervised graph representation learning is critical for downstream node- and graph-level classification tasks. Global structure of graphs helps discriminating representations and existing methods mainly utilize the global structure by imposing additional supervisions. However, their global semantics are usually invariant for all nodes/graphs and they fail to explicitly embed the global semantics to enrich the representations. In this paper, we propose Omni-Granular Ego-Semantic Propagation for Self-Supervised Graph Representation Learning (OEPG). Specifically, we introduce instance-adaptive global-aware ego-semantic descriptors, leveraging the first- and second-order feature differences between each node/graph and hierarchical global clusters of the entire graph dataset. The descriptors can be explicitly integrated into local graph convolution as new neighbor nodes. Besides, we design an omni-granular normalization on the whole scales and hierarchies of the ego-semantic to assign attentional weight to each descriptor from an omni-granular perspective. Specialized pretext tasks and cross-iteration momentum update are further developed for local-global mutual adaptation. In downstream tasks, OEPG consistently achieves the best performance with a 2%~6% accuracy gain on multiple datasets cross scales and domains. 
Notably, OEPG also generalizes to quantity- and topology-imbalance scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22d/yang22d.pdf", "supp": "", "pdf_size": 446806, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11066533014750939467&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "National Institute of Health Data Science, Peking University, Beijing, China+Institute of Medical Technology, Health Science Center of Peking University, Beijing, China; National Institute of Health Data Science, Peking University, Beijing, China+Institute of Medical Technology, Health Science Center of Peking University, Beijing, China", "aff_domain": "163.com;pku.edu.cn", "email": "163.com;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yang22d.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "Peking University", "aff_unique_dep": "National Institute of Health Data Science", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "On Collective Robustness of Bagging Against Data Poisoning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17411", "id": "17411", "proceeding": "https://proceedings.mlr.press/v162/chen22k.html", "poster": "/media/PosterPDFs/ICML%202022/f7177163c833dff4b38fc8d2872f1ec6.png?t=1656606523.4988577", "slides": "/media/icml-2022/Slides/17411.pdf", "author_site": "Ruoxin Chen, Zenan Li, Jie Li, Junchi Yan, Chentao Wu", "author": "Ruoxin Chen; Zenan Li; Jie Li; Junchi Yan; Chentao Wu", "abstract": "Bootstrap aggregating (bagging) is an effective ensemble protocol, which is believed can enhance robustness by its majority voting mechanism. Recent works further prove the sample-wise robustness certificates for certain forms of bagging (e.g. partition aggregation). Beyond these particular forms, in this paper, we propose the first collective certification for general bagging to compute the tight robustness against the global poisoning attack. Specifically, we compute the maximum number of simultaneously changed predictions via solving a binary integer linear programming (BILP) problem. Then we analyze the robustness of vanilla bagging and give the upper bound of the tolerable poison budget. Based on this analysis, we propose hash bagging to improve the robustness of vanilla bagging almost for free. This is achieved by modifying the random subsampling in vanilla bagging to a hash-based deterministic subsampling, as a way of controlling the influence scope for each poisoning sample universally. Our extensive experiments show the notable advantage in terms of applicability and robustness. 
Our code is available at https://github.com/Emiyalzn/ICML22-CRB.", "bibtex": "@InProceedings{pmlr-v162-chen22k,\n title = \t {On Collective Robustness of Bagging Against Data Poisoning},\n author = {Chen, Ruoxin and Li, Zenan and Li, Jie and Yan, Junchi and Wu, Chentao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3299--3319},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22k/chen22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22k.html},\n abstract = \t {Bootstrap aggregating (bagging) is an effective ensemble protocol, which is believed can enhance robustness by its majority voting mechanism. Recent works further prove the sample-wise robustness certificates for certain forms of bagging (e.g. partition aggregation). Beyond these particular forms, in this paper, we propose the first collective certification for general bagging to compute the tight robustness against the global poisoning attack. Specifically, we compute the maximum number of simultaneously changed predictions via solving a binary integer linear programming (BILP) problem. Then we analyze the robustness of vanilla bagging and give the upper bound of the tolerable poison budget. Based on this analysis, we propose hash bagging to improve the robustness of vanilla bagging almost for free. This is achieved by modifying the random subsampling in vanilla bagging to a hash-based deterministic subsampling, as a way of controlling the influence scope for each poisoning sample universally. Our extensive experiments show the notable advantage in terms of applicability and robustness. 
Our code is available at https://github.com/Emiyalzn/ICML22-CRB.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22k/chen22k.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22k-supp.zip", "pdf_size": 858740, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7671982562316508504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University, Shanghai, China + Shanghai AI Laboratory, Shanghai, China; Department of Computer Science and Engineering and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University, Shanghai, China + Shanghai AI Laboratory, Shanghai, China; Department of Computer Science and Engineering and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University, Shanghai, China + Shanghai AI Laboratory, Shanghai, China; Department of Computer Science and Engineering and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University, Shanghai, China + Shanghai AI Laboratory, Shanghai, China; Department of Computer Science and Engineering and MoE Key Lab of Arti\ufb01cial Intelligence, Shanghai Jiao Tong University, Shanghai, China + Shanghai AI Laboratory, Shanghai, China", "aff_domain": "sjtu.edu.cn; ; ; ; ", "email": "sjtu.edu.cn; ; ; ; ", "github": "https://github.com/Emiyalzn/ICML22-CRB", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22k.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0+1", "aff_unique_norm": "Shanghai Jiao Tong University;Shanghai AI Laboratory", "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.shanghaiailab.com", "aff_unique_abbr": "SJTU;SAIL", "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "China" }, { "title": "On Convergence of Gradient Descent Ascent: A Tight Local Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18079", "id": "18079", "proceeding": "https://proceedings.mlr.press/v162/li22e.html", "poster": "/media/PosterPDFs/ICML%202022/89d3d7800304002cd469f0c402bd3ea0.png?t=1658193761.870599", "slides": "", "author_site": "Haochuan Li, Farzan Farnia, Subhro Das, Ali Jadbabaie", "author": "Haochuan Li; Farzan Farnia; Subhro Das; Ali Jadbabaie", "abstract": "Gradient Descent Ascent (GDA) methods are the mainstream algorithms for minimax optimization in generative adversarial networks (GANs). Convergence properties of GDA have drawn significant interest in the recent literature. Specifically, for $\\min_{x} \\max_{y} f(x;y)$ where $f$ is strongly-concave in $y$ and possibly nonconvex in $x$, (Lin et al., 2020) proved the convergence of GDA with a stepsize ratio $\\eta_y/\\eta_x=\\Theta(\\kappa^2)$ where $\\eta_x$ and $\\eta_y$ are the stepsizes for $x$ and $y$ and $\\kappa$ is the condition number for $y$. While this stepsize ratio suggests a slow training of the min player, practical GAN algorithms typically adopt similar stepsizes for both variables, indicating a wide gap between theoretical and empirical results. 
In this paper, we aim to bridge this gap by analyzing the", "bibtex": "@InProceedings{pmlr-v162-li22e,\n title = \t {On Convergence of Gradient Descent Ascent: A Tight Local Analysis},\n author = {Li, Haochuan and Farnia, Farzan and Das, Subhro and Jadbabaie, Ali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12717--12740},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22e/li22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22e.html},\n abstract = \t {Gradient Descent Ascent (GDA) methods are the mainstream algorithms for minimax optimization in generative adversarial networks (GANs). Convergence properties of GDA have drawn significant interest in the recent literature. Specifically, for $\\min_{x} \\max_{y} f(x;y)$ where $f$ is strongly-concave in $y$ and possibly nonconvex in $x$, (Lin et al., 2020) proved the convergence of GDA with a stepsize ratio $\\eta_y/\\eta_x=\\Theta(\\kappa^2)$ where $\\eta_x$ and $\\eta_y$ are the stepsizes for $x$ and $y$ and $\\kappa$ is the condition number for $y$. While this stepsize ratio suggests a slow training of the min player, practical GAN algorithms typically adopt similar stepsizes for both variables, indicating a wide gap between theoretical and empirical results. In this paper, we aim to bridge this gap by analyzing the", "pdf": "https://proceedings.mlr.press/v162/li22e/li22e.pdf", "supp": "", "pdf_size": 911123, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10195595921848174540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of EECS, Massachusetts Institute of Technology; Department of CSE, The Chinese University of Hong Kong; MIT-IBM Watson AI Lab, IBM Research; Department of CEE, Massachusetts Institute of Technology", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/li22e.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Chinese University of Hong Kong;IBM", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;Department of CSE;AI Lab", "aff_unique_url": "https://web.mit.edu;https://www.cuhk.edu.hk;https://www.ibmwatsonai.org/", "aff_unique_abbr": "MIT;CUHK;MIT-IBM AI Lab", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Cambridge;Hong Kong SAR;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "On Distribution Shift in Learning-based Bug Detectors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18391", "id": "18391", "proceeding": "https://proceedings.mlr.press/v162/he22a.html", "poster": "/media/PosterPDFs/ICML%202022/b5200c6107fc3d41d19a2b66835c3974.png?t=1657529046.7970212", "slides": "", "author_site": "Jingxuan He, Luca Beurer-Kellner, Martin Vechev", "author": "Jingxuan He; Luca Beurer-Kellner; Martin Vechev", "abstract": "Deep learning has recently achieved initial success in program analysis tasks such as bug detection. Lacking real bugs, most existing works construct training and test data by injecting synthetic bugs into correct programs. 
Despite achieving high test accuracy (e.g., >90%), the resulting bug detectors are found to be surprisingly unusable in practice, i.e., <10% precision when used to scan real software repositories. In this work, we argue that this massive performance difference is caused by a distribution shift, i.e., a fundamental mismatch between the real bug distribution and the synthetic bug distribution used to train and evaluate the detectors. To address this key challenge, we propose to train a bug detector in two phases, first on a synthetic bug distribution to adapt the model to the bug detection domain, and then on a real bug distribution to drive the model towards the real distribution. During these two phases, we leverage a multi-task hierarchy, focal loss, and contrastive learning to further boost performance. We evaluate our approach extensively on three widely studied bug types, for which we construct new datasets carefully designed to capture the real bug distribution. The results demonstrate that our approach is practically effective and successfully mitigates the distribution shift: our learned detectors are highly performant on both our test set and the latest version of open source repositories. Our code, datasets, and models are publicly available at https://github.com/eth-sri/learning-real-bug-detector.", "bibtex": "@InProceedings{pmlr-v162-he22a,\n title = \t {On Distribution Shift in Learning-based Bug Detectors},\n author = {He, Jingxuan and Beurer-Kellner, Luca and Vechev, Martin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8559--8580},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22a/he22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22a.html},\n abstract = \t {Deep learning has recently achieved initial success in program analysis tasks such as bug detection. Lacking real bugs, most existing works construct training and test data by injecting synthetic bugs into correct programs. Despite achieving high test accuracy (e.g., >90%), the resulting bug detectors are found to be surprisingly unusable in practice, i.e., <10% precision when used to scan real software repositories. In this work, we argue that this massive performance difference is caused by a distribution shift, i.e., a fundamental mismatch between the real bug distribution and the synthetic bug distribution used to train and evaluate the detectors. To address this key challenge, we propose to train a bug detector in two phases, first on a synthetic bug distribution to adapt the model to the bug detection domain, and then on a real bug distribution to drive the model towards the real distribution. During these two phases, we leverage a multi-task hierarchy, focal loss, and contrastive learning to further boost performance. We evaluate our approach extensively on three widely studied bug types, for which we construct new datasets carefully designed to capture the real bug distribution. The results demonstrate that our approach is practically effective and successfully mitigates the distribution shift: our learned detectors are highly performant on both our test set and the latest version of open source repositories. 
Our code, datasets, and models are publicly available at https://github.com/eth-sri/learning-real-bug-detector.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22a/he22a.pdf", "supp": "", "pdf_size": 392788, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16187870824460798751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ", "email": "inf.ethz.ch; ; ", "github": "https://github.com/eth-sri/learning-real-bug-detector", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/he22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "On Finite-Sample Identifiability of Contrastive Learning-Based Nonlinear Independent Component Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16871", "id": "16871", "proceeding": "https://proceedings.mlr.press/v162/lyu22a.html", "poster": "/media/PosterPDFs/ICML%202022/723dadb8c699bf14f74503dbcb6e09c1_94wjHxW.png?t=1657560860.1076531", "slides": "", "author_site": "Qi Lyu, Xiao Fu", "author": "Qi Lyu; Xiao Fu", "abstract": "Nonlinear independent component analysis (nICA) aims at recovering statistically independent latent components that are mixed by unknown nonlinear functions. Central to nICA is the identifiability of the latent components, which had been elusive until very recently. Specifically, Hyv\u00e4rinen et al. have shown that the nonlinearly mixed latent components are identifiable (up to often inconsequential ambiguities) under a generalized contrastive learning (GCL) formulation, given that the latent components are independent conditioned on a certain auxiliary variable. The GCL-based identifiability of nICA is elegant, and establishes interesting connections between nICA and popular unsupervised/self-supervised learning paradigms in representation learning, causal learning, and factor disentanglement. However, existing identifiability analyses of nICA all build upon an unlimited sample assumption and the use of ideal universal function learners\u2014which creates a non-negligible gap between theory and practice. Closing the gap is a nontrivial challenge, as there is a lack of established \u201ctextbook\u201d routine for finite sample analysis of such unsupervised problems. This work puts forth a finite-sample identifiability analysis of GCL-based nICA. Our analytical framework judiciously combines the properties of the GCL loss function, statistical generalization analysis, and numerical differentiation. Our framework also takes the learning function\u2019s approximation error into consideration, and reveals an intuitive trade-off between the complexity and expressiveness of the employed function learner. 
Numerical experiments are used to validate the theorems.", "bibtex": "@InProceedings{pmlr-v162-lyu22a,\n title = \t {On Finite-Sample Identifiability of Contrastive Learning-Based Nonlinear Independent Component Analysis},\n author = {Lyu, Qi and Fu, Xiao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14582--14600},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lyu22a/lyu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lyu22a.html},\n abstract = \t {Nonlinear independent component analysis (nICA) aims at recovering statistically independent latent components that are mixed by unknown nonlinear functions. Central to nICA is the identifiability of the latent components, which had been elusive until very recently. Specifically, Hyv\u00e4rinen et al. have shown that the nonlinearly mixed latent components are identifiable (up to often inconsequential ambiguities) under a generalized contrastive learning (GCL) formulation, given that the latent components are independent conditioned on a certain auxiliary variable. The GCL-based identifiability of nICA is elegant, and establishes interesting connections between nICA and popular unsupervised/self-supervised learning paradigms in representation learning, causal learning, and factor disentanglement. However, existing identifiability analyses of nICA all build upon an unlimited sample assumption and the use of ideal universal function learners\u2014which creates a non-negligible gap between theory and practice. Closing the gap is a nontrivial challenge, as there is a lack of established \u201ctextbook\u201d routine for finite sample analysis of such unsupervised problems. This work puts forth a finite-sample identifiability analysis of GCL-based nICA. Our analytical framework judiciously combines the properties of the GCL loss function, statistical generalization analysis, and numerical differentiation. Our framework also takes the learning function\u2019s approximation error into consideration, and reveals an intuitive trade-off between the complexity and expressiveness of the employed function learner. 
Numerical experiments are used to validate the theorems.}\n}", "pdf": "https://proceedings.mlr.press/v162/lyu22a/lyu22a.pdf", "supp": "", "pdf_size": 496267, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5958134825965348047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of EECS, Oregon State University, Corvallis, OR, United States; School of EECS, Oregon State University, Corvallis, OR, United States", "aff_domain": "oregonstate.edu;oregonstate.edu", "email": "oregonstate.edu;oregonstate.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/lyu22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "School of EECS", "aff_unique_url": "https://osu.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Corvallis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On Implicit Bias in Overparameterized Bilevel Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18367", "id": "18367", "proceeding": "https://proceedings.mlr.press/v162/vicol22a.html", "poster": "/media/PosterPDFs/ICML%202022/27d52bcb3580724eb4cbe9f2718a9365.png?t=1657509985.7050664", "slides": "", "author_site": "Paul Vicol, Jonathan Lorraine, Fabian Pedregosa, David Duvenaud, Roger Grosse", "author": "Paul Vicol; Jonathan P Lorraine; Fabian Pedregosa; David Duvenaud; Roger B Grosse", "abstract": "Many problems in machine learning involve bilevel optimization (BLO), including hyperparameter optimization, meta-learning, and dataset distillation. Bilevel problems involve inner and outer parameters, each optimized for its own objective. Often, at least one of the two levels is underspecified and there are multiple ways to choose among equivalent optima. Inspired by recent studies of the implicit bias induced by optimization algorithms in single-level optimization, we investigate the implicit bias of different gradient-based algorithms for jointly optimizing the inner and outer parameters. We delineate two standard BLO methods\u2014cold-start and warm-start BLO\u2014and show that the converged solution or long-run behavior depends to a large degree on these and other algorithmic choices, such as the hypergradient approximation. We also show that the solutions from warm-start BLO can encode a surprising amount of information about the outer objective, even when the outer optimization variables are low-dimensional. 
We believe that implicit bias deserves as central a role in the study of bilevel optimization as it has attained in the study of single-level neural net optimization.", "bibtex": "@InProceedings{pmlr-v162-vicol22a,\n title = \t {On Implicit Bias in Overparameterized Bilevel Optimization},\n author = {Vicol, Paul and Lorraine, Jonathan P and Pedregosa, Fabian and Duvenaud, David and Grosse, Roger B},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22234--22259},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vicol22a/vicol22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vicol22a.html},\n abstract = \t {Many problems in machine learning involve bilevel optimization (BLO), including hyperparameter optimization, meta-learning, and dataset distillation. Bilevel problems involve inner and outer parameters, each optimized for its own objective. Often, at least one of the two levels is underspecified and there are multiple ways to choose among equivalent optima. Inspired by recent studies of the implicit bias induced by optimization algorithms in single-level optimization, we investigate the implicit bias of different gradient-based algorithms for jointly optimizing the inner and outer parameters. We delineate two standard BLO methods\u2014cold-start and warm-start BLO\u2014and show that the converged solution or long-run behavior depends to a large degree on these and other algorithmic choices, such as the hypergradient approximation. We also show that the solutions from warm-start BLO can encode a surprising amount of information about the outer objective, even when the outer optimization variables are low-dimensional. 
We believe that implicit bias deserves as central a role in the study of bilevel optimization as it has attained in the study of single-level neural net optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/vicol22a/vicol22a.pdf", "supp": "", "pdf_size": 5698268, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7636925967785226800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "University of Toronto + Vector Institute; University of Toronto + Vector Institute; Google Research; University of Toronto + Vector Institute; University of Toronto + Vector Institute", "aff_domain": "cs.toronto.edu; ; ; ; ", "email": "cs.toronto.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/vicol22a.html", "aff_unique_index": "0+1;0+1;2;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute;Google", "aff_unique_dep": ";;Google Research", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://research.google", "aff_unique_abbr": "U of T;Vector Institute;Google Research", "aff_campus_unique_index": ";;1;;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0+0;1;0+0;0+0", "aff_country_unique": "Canada;United States" }, { "title": "On Improving Model-Free Algorithms for Decentralized Multi-Agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16667", "id": "16667", "proceeding": "https://proceedings.mlr.press/v162/mao22a.html", "poster": "/media/PosterPDFs/ICML%202022/0084ae4bc24c0795d1e6a4f58444d39b.png?t=1657411545.2425644", "slides": "", "author_site": "Weichao Mao, Lin Yang, Kaiqing Zhang, Tamer Basar", "author": "Weichao Mao; Lin Yang; Kaiqing Zhang; Tamer Basar", "abstract": "Multi-agent reinforcement learning (MARL) algorithms often suffer from an exponential sample complexity dependence on the number of agents, a phenomenon known as", "bibtex": "@InProceedings{pmlr-v162-mao22a,\n title = \t {On Improving Model-Free Algorithms for Decentralized Multi-Agent Reinforcement Learning},\n author = {Mao, Weichao and Yang, Lin and Zhang, Kaiqing and Basar, Tamer},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15007--15049},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mao22a/mao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mao22a.html},\n abstract = \t {Multi-agent reinforcement learning (MARL) algorithms often suffer from an exponential sample complexity dependence on the number of agents, a phenomenon known as", "pdf": "https://proceedings.mlr.press/v162/mao22a/mao22a.pdf", "supp": "", "pdf_size": 3313951, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16389730999081843503&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering & Coordinated Science Laboratory, University of Illinois Urbana-Champaign; Department of Electrical and Computer Engineering, University of California, Los Angeles + DeepMind; Laboratory for Information & Decision Systems, Massachusetts Institute of Technology + Simons Institute for the Theory of Computing; Department of Electrical and 
Computer Engineering & Coordinated Science Laboratory, University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;ee.ucla.edu;mit.edu;illinois.edu", "email": "illinois.edu;ee.ucla.edu;mit.edu;illinois.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mao22a.html", "aff_unique_index": "0;1+2;3+4;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of California, Los Angeles;DeepMind;Massachusetts Institute of Technology;Simons Institute for the Theory of Computing", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering;;Laboratory for Information & Decision Systems;", "aff_unique_url": "https://illinois.edu;https://www.ucla.edu;https://deepmind.com;https://web.mit.edu;https://simons.berkeley.edu", "aff_unique_abbr": "UIUC;UCLA;DeepMind;MIT;", "aff_campus_unique_index": "0;1;3;0", "aff_campus_unique": "Urbana-Champaign;Los Angeles;;Cambridge", "aff_country_unique_index": "0;0+1;0+0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "On Last-Iterate Convergence Beyond Zero-Sum Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17011", "id": "17011", "proceeding": "https://proceedings.mlr.press/v162/anagnostides22a.html", "poster": "/media/PosterPDFs/ICML%202022/fcdf698a5d673435e0a5a6f9ffea05ca_eJ9Laz7.png?t=1657900226.4658601", "slides": "/media/icml-2022/Slides/17011_BJeFy3o.pdf", "author_site": "Ioannis Anagnostides, Ioannis Panageas, Gabriele Farina, Tuomas Sandholm", "author": "Ioannis Anagnostides; Ioannis Panageas; Gabriele Farina; Tuomas Sandholm", "abstract": "Most existing results about last-iterate convergence of learning dynamics are limited to two-player zero-sum games, and only apply under rigid assumptions about what dynamics the players follow. In this paper we provide new results and techniques that apply to broader families of games and learning dynamics. First, we show that in a class of games that includes constant-sum polymatrix and strategically zero-sum games, the trajectories of dynamics such as optimistic mirror descent (OMD) exhibit a boundedness property, which holds even when players employ different algorithms and prediction mechanisms. This property enables us to obtain $O(1/\\sqrt{T})$ rates and optimal $O(1)$ regret bounds. Our analysis also reveals a surprising property: OMD either reaches arbitrarily close to a Nash equilibrium or it outperforms the robust price of anarchy in efficiency. Moreover, for potential games we establish convergence to an $\\epsilon$-equilibrium after $O(1/\\epsilon^2)$ iterations for mirror descent under a broad class of regularizers, as well as optimal $O(1)$ regret bounds for OMD variants. Our framework also extends to near-potential games, and unifies known analyses for distributed learning in Fisher\u2019s market model. 
Finally, we analyze the convergence, efficiency, and robustness of optimistic gradient descent (OGD) in general-sum continuous games.", "bibtex": "@InProceedings{pmlr-v162-anagnostides22a,\n title = \t {On Last-Iterate Convergence Beyond Zero-Sum Games},\n author = {Anagnostides, Ioannis and Panageas, Ioannis and Farina, Gabriele and Sandholm, Tuomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {536--581},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/anagnostides22a/anagnostides22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/anagnostides22a.html},\n abstract = \t {Most existing results about last-iterate convergence of learning dynamics are limited to two-player zero-sum games, and only apply under rigid assumptions about what dynamics the players follow. In this paper we provide new results and techniques that apply to broader families of games and learning dynamics. First, we show that in a class of games that includes constant-sum polymatrix and strategically zero-sum games, the trajectories of dynamics such as optimistic mirror descent (OMD) exhibit a boundedness property, which holds even when players employ different algorithms and prediction mechanisms. This property enables us to obtain $O(1/\\sqrt{T})$ rates and optimal $O(1)$ regret bounds. Our analysis also reveals a surprising property: OMD either reaches arbitrarily close to a Nash equilibrium or it outperforms the robust price of anarchy in efficiency. Moreover, for potential games we establish convergence to an $\\epsilon$-equilibrium after $O(1/\\epsilon^2)$ iterations for mirror descent under a broad class of regularizers, as well as optimal $O(1)$ regret bounds for OMD variants. Our framework also extends to near-potential games, and unifies known analyses for distributed learning in Fisher\u2019s market model. Finally, we analyze the convergence, efficiency, and robustness of optimistic gradient descent (OGD) in general-sum continuous games.}\n}", "pdf": "https://proceedings.mlr.press/v162/anagnostides22a/anagnostides22a.pdf", "supp": "", "pdf_size": 1105875, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9396143684666970283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Carnegie Mellon University; University of California Irvine; Carnegie Mellon University; Carnegie Mellon University + Strategy Robot, Inc. + Optimized Markets, Inc. 
+ Strategic Machine, Inc.", "aff_domain": "cs.cmu.edu; ; ; ", "email": "cs.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/anagnostides22a.html", "aff_unique_index": "0;1;0;0+2+3+4", "aff_unique_norm": "Carnegie Mellon University;University of California, Irvine;Strategy Robot, Inc.;Optimized Markets, Inc.;Strategic Machine, Inc.", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.cmu.edu;https://www.uci.edu;;;", "aff_unique_abbr": "CMU;UCI;;;", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;0;0;0+0+0+0", "aff_country_unique": "United States" }, { "title": "On Learning Mixture of Linear Regressions in the Non-Realizable Setting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16903", "id": "16903", "proceeding": "https://proceedings.mlr.press/v162/pal22b.html", "poster": "/media/PosterPDFs/ICML%202022/85690f81aadc1749175c187784afc9ee_pAOyUgJ.png?t=1657490379.4280791", "slides": "", "author_site": "Soumyabrata Pal, Arya Mazumdar, Rajat Sen, Avishek Ghosh", "author": "Soumyabrata Pal; Arya Mazumdar; Rajat Sen; Avishek Ghosh", "abstract": "While mixture of linear regressions (MLR) is a well-studied topic, prior works usually do not analyze such models for prediction error. In fact,", "bibtex": "@InProceedings{pmlr-v162-pal22b,\n title = \t {On Learning Mixture of Linear Regressions in the Non-Realizable Setting},\n author = {Pal, Soumyabrata and Mazumdar, Arya and Sen, Rajat and Ghosh, Avishek},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17202--17220},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pal22b/pal22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/pal22b.html},\n abstract = \t {While mixture of linear regressions (MLR) is a well-studied topic, prior works usually do not analyze such models for prediction error. 
In fact,", "pdf": "https://proceedings.mlr.press/v162/pal22b/pal22b.pdf", "supp": "", "pdf_size": 408442, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14927766509867561455&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Hal\u0131c\u0131o\u011flu Data Science Institute (HDSI), UC San Diego, USA; Hal\u0131c\u0131o\u011flu Data Science Institute (HDSI), UC San Diego, USA; Google Research, India; Google Research, Palo Alto, USA", "aff_domain": "ucsd.edu;ucsd.edu;google.com;google.com", "email": "ucsd.edu;ucsd.edu;google.com;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/pal22b.html", "aff_unique_index": "0;0;1;1", "aff_unique_norm": "University of California, San Diego;Google", "aff_unique_dep": "Hal\u0131c\u0131o\u011flu Data Science Institute;Google Research", "aff_unique_url": "https://ucsd.edu;https://research.google", "aff_unique_abbr": "UCSD;Google Research", "aff_campus_unique_index": "0;0;1;2", "aff_campus_unique": "San Diego;India;Palo Alto", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;India" }, { "title": "On Measuring Causal Contributions via do-interventions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17033", "id": "17033", "proceeding": "https://proceedings.mlr.press/v162/jung22a.html", "poster": "/media/PosterPDFs/ICML%202022/522e1ea43810e90242942ccc0995dae1.png?t=1658264403.558094", "slides": "/media/icml-2022/Slides/17033.pdf", "author_site": "Yonghan Jung, Shiva Kasiviswanathan, Jin Tian, Dominik Janzing, Patrick Bloebaum, Elias Bareinboim", "author": "Yonghan Jung; Shiva Kasiviswanathan; Jin Tian; Dominik Janzing; Patrick Bloebaum; Elias Bareinboim", "abstract": "Causal contributions measure the strengths of different causes to a target quantity. Understanding causal contributions is important in empirical sciences and data-driven disciplines since it allows to answer practical queries like \u201cwhat are the contributions of each cause to the effect?\u201d In this paper, we develop a principled method for quantifying causal contributions. First, we provide desiderata of properties axioms that causal contribution measures should satisfy and propose the do-Shapley values (inspired by do-interventions [Pearl, 2000]) as a unique method satisfying these properties. Next, we develop a criterion under which the do-Shapley values can be efficiently inferred from non-experimental data. Finally, we provide do-Shapley estimators exhibiting consistency, computational feasibility, and statistical robustness. 
Simulation results corroborate with the theory.", "bibtex": "@InProceedings{pmlr-v162-jung22a,\n title = \t {On Measuring Causal Contributions via do-interventions},\n author = {Jung, Yonghan and Kasiviswanathan, Shiva and Tian, Jin and Janzing, Dominik and Bloebaum, Patrick and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10476--10501},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jung22a/jung22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jung22a.html},\n abstract = \t {Causal contributions measure the strengths of different causes to a target quantity. Understanding causal contributions is important in empirical sciences and data-driven disciplines since it allows to answer practical queries like \u201cwhat are the contributions of each cause to the effect?\u201d In this paper, we develop a principled method for quantifying causal contributions. First, we provide desiderata of properties axioms that causal contribution measures should satisfy and propose the do-Shapley values (inspired by do-interventions [Pearl, 2000]) as a unique method satisfying these properties. Next, we develop a criterion under which the do-Shapley values can be efficiently inferred from non-experimental data. Finally, we provide do-Shapley estimators exhibiting consistency, computational feasibility, and statistical robustness. Simulation results corroborate with the theory.}\n}", "pdf": "https://proceedings.mlr.press/v162/jung22a/jung22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jung22a-supp.zip", "pdf_size": 1177722, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16599642198663870499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Purdue University; Amazon; Iowa State University; Amazon; Amazon; Columbia University", "aff_domain": "purdue.edu; ; ; ; ; ", "email": "purdue.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/jung22a.html", "aff_unique_index": "0;1;2;1;1;3", "aff_unique_norm": "Purdue University;Amazon;Iowa State University;Columbia University", "aff_unique_dep": ";Amazon.com, Inc.;;", "aff_unique_url": "https://www.purdue.edu;https://www.amazon.com;https://www.iastate.edu;https://www.columbia.edu", "aff_unique_abbr": "Purdue;Amazon;ISU;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Non-local Convergence Analysis of Deep Linear Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16811", "id": "16811", "proceeding": "https://proceedings.mlr.press/v162/chen22p.html", "poster": "/media/PosterPDFs/ICML%202022/49182f81e6a13cf5eaa496d51fea6406_K6IfVL4.png?t=1657383998.0560172", "slides": "", "author_site": "Kun Chen, Dachao Lin, Zhihua Zhang", "author": "Kun Chen; Dachao Lin; Zhihua Zhang", "abstract": "In this paper, we study the non-local convergence properties of deep linear networks. Specifically, under the quadratic loss, we consider optimizing deep linear networks in which there is at least a layer with only one neuron. 
We describe the convergent point of trajectories with an arbitrary balanced starting point under gradient flow, including the paths which converge to one of the saddle points. We also show specific convergence rates of trajectories that converge to the global minimizers by stages. We conclude that the rates vary from polynomial to linear. As far as we know, our results are the first to give a non-local analysis of deep linear neural networks with arbitrary balanced initialization, rather than the lazy training regime which has dominated the literature on neural networks or the restricted benign initialization.", "bibtex": "@InProceedings{pmlr-v162-chen22p,\n title = \t {On Non-local Convergence Analysis of Deep Linear Networks},\n author = {Chen, Kun and Lin, Dachao and Zhang, Zhihua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3417--3443},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22p/chen22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22p.html},\n abstract = \t {In this paper, we study the non-local convergence properties of deep linear networks. Specifically, under the quadratic loss, we consider optimizing deep linear networks in which there is at least a layer with only one neuron. We describe the convergent point of trajectories with an arbitrary balanced starting point under gradient flow, including the paths which converge to one of the saddle points. We also show specific convergence rates of trajectories that converge to the global minimizers by stages. We conclude that the rates vary from polynomial to linear. 
As far as we know, our results are the first to give a non-local analysis of deep linear neural networks with arbitrary balanced initialization, rather than the lazy training regime which has dominated the literature on neural networks or the restricted benign initialization.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22p/chen22p.pdf", "supp": "", "pdf_size": 599238, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6445590660054568300&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "School of Mathematical Sciences, Peking University; Academy for Advanced Interdisciplinary Studies, Peking University; School of Mathematical Sciences, Peking University", "aff_domain": "math.pku.edu.cn;math.pku.edu.cn;math.pku.edu.cn", "email": "math.pku.edu.cn;math.pku.edu.cn;math.pku.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22p.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "School of Mathematical Sciences", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On Numerical Integration in Neural Ordinary Differential Equations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16941", "id": "16941", "proceeding": "https://proceedings.mlr.press/v162/zhu22f.html", "poster": "/media/PosterPDFs/ICML%202022/880610aa9f9de9ea7c545169c716f477_TRIFPKR.png?t=1657256915.3562164", "slides": "", "author_site": "Aiqing Zhu, Pengzhan Jin, Beibei Zhu, Yifa Tang", "author": "Aiqing Zhu; Pengzhan Jin; Beibei Zhu; Yifa Tang", "abstract": "The combination of ordinary differential equations and neural networks, i.e., neural ordinary differential equations (Neural ODE), has been widely studied from various angles. However, deciphering the numerical integration in Neural ODE is still an open challenge, as many researches demonstrated that numerical integration significantly affects the performance of the model. In this paper, we propose the inverse modified differential equations (IMDE) to clarify the influence of numerical integration on training Neural ODE models. IMDE is determined by the learning task and the employed ODE solver. It is shown that training a Neural ODE model actually returns a close approximation of the IMDE, rather than the true ODE. With the help of IMDE, we deduce that (i) the discrepancy between the learned model and the true ODE is bounded by the sum of discretization error and learning loss; (ii) Neural ODE using non-symplectic numerical integration fail to learn conservation laws theoretically. 
Several experiments are performed to numerically verify our theoretical analysis.", "bibtex": "@InProceedings{pmlr-v162-zhu22f,\n title = \t {On Numerical Integration in Neural Ordinary Differential Equations},\n author = {Zhu, Aiqing and Jin, Pengzhan and Zhu, Beibei and Tang, Yifa},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27527--27547},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22f/zhu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22f.html},\n abstract = \t {The combination of ordinary differential equations and neural networks, i.e., neural ordinary differential equations (Neural ODE), has been widely studied from various angles. However, deciphering the numerical integration in Neural ODE is still an open challenge, as many researches demonstrated that numerical integration significantly affects the performance of the model. In this paper, we propose the inverse modified differential equations (IMDE) to clarify the influence of numerical integration on training Neural ODE models. IMDE is determined by the learning task and the employed ODE solver. It is shown that training a Neural ODE model actually returns a close approximation of the IMDE, rather than the true ODE. With the help of IMDE, we deduce that (i) the discrepancy between the learned model and the true ODE is bounded by the sum of discretization error and learning loss; (ii) Neural ODE using non-symplectic numerical integration fail to learn conservation laws theoretically. 
Several experiments are performed to numerically verify our theoretical analysis.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22f/zhu22f.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhu22f-supp.zip", "pdf_size": 532428, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1480049561976484832&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "LSEC, ICMSEC, Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Beijing 100190, China+School of Mathematical Sciences, University of Chinese Academy of Sciences, Beijing 100049, China; School of Mathematical Sciences, Peking University, Beijing 100871, China; School of Mathematics and Physics, University of Science and Technology Beijing, Beijing 100083, China; LSEC, ICMSEC, Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Beijing 100190, China+School of Mathematical Sciences, University of Chinese Academy of Sciences, Beijing 100049, China", "aff_domain": "lsec.cc.ac.cn; ; ;lsec.cc.ac.cn", "email": "lsec.cc.ac.cn; ; ;lsec.cc.ac.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhu22f.html", "aff_unique_index": "0+1;2;3;0+1", "aff_unique_norm": "Chinese Academy of Sciences;University of Chinese Academy of Sciences;Peking University;University of Science and Technology Beijing", "aff_unique_dep": "Academy of Mathematics and Systems Science;School of Mathematical Sciences;School of Mathematical Sciences;School of Mathematics and Physics", "aff_unique_url": "http://www.cas.cn;http://www.ucas.ac.cn;http://www.pku.edu.cn;http://www.ustb.edu.cn", "aff_unique_abbr": "CAS;UCAS;PKU;USTB", "aff_campus_unique_index": "0+0;0;0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "China" }, { "title": "On Transportation of Mini-batches: A Hierarchical Approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16711", "id": "16711", "proceeding": "https://proceedings.mlr.press/v162/nguyen22d.html", "poster": "/media/PosterPDFs/ICML%202022/f7e2b2b75b04175610e5a00c1e221ebb_MOpz2NV.png?t=1657398259.7545717", "slides": "", "author_site": "Khai Nguyen, Dang Nguyen, Quoc Nguyen, Tung Pham, Hung Bui, Dinh Phung, Trung Le, Nhat Ho", "author": "Khai Nguyen; Dang Nguyen; Quoc Dinh Nguyen; Tung Pham; Hung Bui; Dinh Phung; Trung Le; Nhat Ho", "abstract": "Mini-batch optimal transport (m-OT) has been successfully used in practical applications that involve probability measures with a very high number of supports. The m-OT solves several smaller optimal transport problems and then returns the average of their costs and transportation plans. Despite its scalability advantage, the m-OT does not consider the relationship between mini-batches which leads to undesirable estimation. Moreover, the m-OT does not approximate a proper metric between probability measures since the identity property is not satisfied. To address these problems, we propose a novel mini-batch scheme for optimal transport, named Batch of Mini-batches Optimal Transport (BoMb-OT), that finds the optimal coupling between mini-batches and it can be seen as an approximation to a well-defined distance on the space of probability measures. Furthermore, we show that the m-OT is a limit of the entropic regularized version of the BoMb-OT when the regularized parameter goes to infinity. 
Finally, we carry out experiments on various applications including deep generative models, deep domain adaptation, approximate Bayesian computation, color transfer, and gradient flow to show that the BoMb-OT can be widely applied and performs well in various applications.", "bibtex": "@InProceedings{pmlr-v162-nguyen22d,\n title = \t {On Transportation of Mini-batches: A Hierarchical Approach},\n author = {Nguyen, Khai and Nguyen, Dang and Nguyen, Quoc Dinh and Pham, Tung and Bui, Hung and Phung, Dinh and Le, Trung and Ho, Nhat},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16622--16655},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nguyen22d/nguyen22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/nguyen22d.html},\n abstract = \t {Mini-batch optimal transport (m-OT) has been successfully used in practical applications that involve probability measures with a very high number of supports. The m-OT solves several smaller optimal transport problems and then returns the average of their costs and transportation plans. Despite its scalability advantage, the m-OT does not consider the relationship between mini-batches which leads to undesirable estimation. Moreover, the m-OT does not approximate a proper metric between probability measures since the identity property is not satisfied. To address these problems, we propose a novel mini-batch scheme for optimal transport, named Batch of Mini-batches Optimal Transport (BoMb-OT), that finds the optimal coupling between mini-batches and it can be seen as an approximation to a well-defined distance on the space of probability measures. Furthermore, we show that the m-OT is a limit of the entropic regularized version of the BoMb-OT when the regularized parameter goes to infinity. 
Finally, we carry out experiments on various applications including deep generative models, deep domain adaptation, approximate Bayesian computation, color transfer, and gradient flow to show that the BoMb-OT can be widely applied and performs well in various applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/nguyen22d/nguyen22d.pdf", "supp": "", "pdf_size": 30301052, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5470461083042798465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics and Data Sciences, The University of Texas at Austin; VinAI Research; VinAI Research; VinAI Research; VinAI Research; Monash University; Monash University; Department of Statistics and Data Sciences, The University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ; ; ; ; ", "email": "utexas.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/nguyen22d.html", "aff_unique_index": "0;1;1;1;1;2;2;0", "aff_unique_norm": "University of Texas at Austin;VinAI Research;Monash University", "aff_unique_dep": "Department of Statistics and Data Sciences;;", "aff_unique_url": "https://www.utexas.edu;https://www.vinai.io/;https://www.monash.edu", "aff_unique_abbr": "UT Austin;VinAI;Monash", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;1;1;1;2;2;0", "aff_country_unique": "United States;Vietnam;Australia" }, { "title": "On Well-posedness and Minimax Optimal Rates of Nonparametric Q-function Estimation in Off-policy Evaluation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16057", "id": "16057", "proceeding": "https://proceedings.mlr.press/v162/chen22u.html", "poster": "/media/PosterPDFs/ICML%202022/04048aeca2c0f5d84639358008ed2ae7.png?t=1657650916.5372188", "slides": "", "author_site": "Xiaohong Chen, Zhengling Qi", "author": "Xiaohong Chen; Zhengling Qi", "abstract": "We study the off-policy evaluation (OPE) problem in an infinite-horizon Markov decision process with continuous states and actions. We recast the $Q$-function estimation into a special form of the nonparametric instrumental variables (NPIV) estimation problem. We first show that under one mild condition the NPIV formulation of $Q$-function estimation is well-posed in the sense of $L^2$-measure of ill-posedness with respect to the data generating distribution, bypassing a strong assumption on the discount factor $\\gamma$ imposed in the recent literature for obtaining the $L^2$ convergence rates of various $Q$-function estimators. Thanks to this new well-posed property, we derive the first minimax lower bounds for the convergence rates of nonparametric estimation of $Q$-function and its derivatives in both sup-norm and $L^2$-norm, which are shown to be the same as those for the classical nonparametric regression (Stone, 1982). We then propose a sieve two-stage least squares estimator and establish its rate-optimality in both norms under some mild conditions. 
Our general results on the well-posedness and the minimax lower bounds are of independent interest to study not only other nonparametric estimators for $Q$-function but also efficient estimation on the value of any target policy in off-policy settings.", "bibtex": "@InProceedings{pmlr-v162-chen22u,\n title = \t {On Well-posedness and Minimax Optimal Rates of Nonparametric Q-function Estimation in Off-policy Evaluation},\n author = {Chen, Xiaohong and Qi, Zhengling},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3558--3582},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22u/chen22u.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22u.html},\n abstract = \t {We study the off-policy evaluation (OPE) problem in an infinite-horizon Markov decision process with continuous states and actions. We recast the $Q$-function estimation into a special form of the nonparametric instrumental variables (NPIV) estimation problem. We first show that under one mild condition the NPIV formulation of $Q$-function estimation is well-posed in the sense of $L^2$-measure of ill-posedness with respect to the data generating distribution, bypassing a strong assumption on the discount factor $\\gamma$ imposed in the recent literature for obtaining the $L^2$ convergence rates of various $Q$-function estimators. Thanks to this new well-posed property, we derive the first minimax lower bounds for the convergence rates of nonparametric estimation of $Q$-function and its derivatives in both sup-norm and $L^2$-norm, which are shown to be the same as those for the classical nonparametric regression (Stone, 1982). We then propose a sieve two-stage least squares estimator and establish its rate-optimality in both norms under some mild conditions. 
Our general results on the well-posedness and the minimax lower bounds are of independent interest to study not only other nonparametric estimators for $Q$-function but also efficient estimation on the value of any target policy in off-policy settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22u/chen22u.pdf", "supp": "", "pdf_size": 495272, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10298790481846052301&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Cowles Foundation for Research in Economics, Yale University; Department of Decision Sciences, George Washington University", "aff_domain": "yale.edu;gwu.edu", "email": "yale.edu;gwu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/chen22u.html", "aff_unique_index": "0;1", "aff_unique_norm": "Yale University;George Washington University", "aff_unique_dep": "Cowles Foundation for Research in Economics;Department of Decision Sciences", "aff_unique_url": "https://www.yale.edu;https://www.gwu.edu", "aff_unique_abbr": "Yale;GWU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the Adversarial Robustness of Causal Algorithmic Recourse", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18199", "id": "18199", "proceeding": "https://proceedings.mlr.press/v162/dominguez-olmedo22a.html", "poster": "/media/PosterPDFs/ICML%202022/831b1ac54cc8db480e3babac5fa2256b.png?t=1657189916.7134006", "slides": "/media/icml-2022/Slides/18199.pdf", "author_site": "Ricardo Dominguez-Olmedo, Amir Karimi, Bernhard Sch\u00f6lkopf", "author": "Ricardo Dominguez-Olmedo; Amir H Karimi; Bernhard Sch\u00f6lkopf", "abstract": "Algorithmic recourse seeks to provide actionable recommendations for individuals to overcome unfavorable classification outcomes from automated decision-making systems. Recourse recommendations should ideally be robust to reasonably small uncertainty in the features of the individual seeking recourse. In this work, we formulate the adversarially robust recourse problem and show that recourse methods that offer minimally costly recourse fail to be robust. We then present methods for generating adversarially robust recourse for linear and for differentiable classifiers. Finally, we show that regularizing the decision-making classifier to behave locally linearly and to rely more strongly on actionable features facilitates the existence of adversarially robust recourse.", "bibtex": "@InProceedings{pmlr-v162-dominguez-olmedo22a,\n title = \t {On the Adversarial Robustness of Causal Algorithmic Recourse},\n author = {Dominguez-Olmedo, Ricardo and Karimi, Amir H and Sch{\\\"o}lkopf, Bernhard},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5324--5342},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dominguez-olmedo22a/dominguez-olmedo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dominguez-olmedo22a.html},\n abstract = \t {Algorithmic recourse seeks to provide actionable recommendations for individuals to overcome unfavorable classification outcomes from automated decision-making systems. 
Recourse recommendations should ideally be robust to reasonably small uncertainty in the features of the individual seeking recourse. In this work, we formulate the adversarially robust recourse problem and show that recourse methods that offer minimally costly recourse fail to be robust. We then present methods for generating adversarially robust recourse for linear and for differentiable classifiers. Finally, we show that regularizing the decision-making classifier to behave locally linearly and to rely more strongly on actionable features facilitates the existence of adversarially robust recourse.}\n}", "pdf": "https://proceedings.mlr.press/v162/dominguez-olmedo22a/dominguez-olmedo22a.pdf", "supp": "", "pdf_size": 693992, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16011924534958641945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+University of T\u00fcbingen, Germany; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+ETH Z\u00fcrich, Switzerland; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "tuebingen.mpg.de; ; ", "email": "tuebingen.mpg.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dominguez-olmedo22a.html", "aff_unique_index": "0+1;0+2;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of T\u00fcbingen;ETH Zurich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/;https://www.ethz.ch", "aff_unique_abbr": "MPI-IS;Uni T\u00fcbingen;ETHZ", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0+0;0+1;0", "aff_country_unique": "Germany;Switzerland" }, { "title": "On the Convergence of Inexact Predictor-Corrector Methods for Linear Programming", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16319", "id": "16319", "proceeding": "https://proceedings.mlr.press/v162/dexter22a.html", "poster": "", "slides": "", "author_site": "Gregory Dexter, Agniva Chowdhury, Haim Avron, Petros Drineas", "author": "Gregory Dexter; Agniva Chowdhury; Haim Avron; Petros Drineas", "abstract": "Interior point methods (IPMs) are a common approach for solving linear programs (LPs) with strong theoretical guarantees and solid empirical performance. The time complexity of these methods is dominated by the cost of solving a linear system of equations at each iteration. In common applications of linear programming, particularly in machine learning and scientific computing, the size of this linear system can become prohibitively large, requiring the use of iterative solvers, which provide an approximate solution to the linear system. However, approximately solving the linear system at each iteration of an IPM invalidates the theoretical guarantees of common IPM analyses. To remedy this, we theoretically and empirically analyze (slightly modified) predictor-corrector IPMs when using approximate linear solvers: our approach guarantees that, when certain conditions are satisfied, the number of IPM iterations does not increase and that the final solution remains feasible. 
We also provide practical instantiations of approximate linear solvers that satisfy these conditions for special classes of constraint matrices using randomized linear algebra.", "bibtex": "@InProceedings{pmlr-v162-dexter22a,\n title = \t {On the Convergence of Inexact Predictor-Corrector Methods for Linear Programming},\n author = {Dexter, Gregory and Chowdhury, Agniva and Avron, Haim and Drineas, Petros},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5007--5038},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dexter22a/dexter22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dexter22a.html},\n abstract = \t {Interior point methods (IPMs) are a common approach for solving linear programs (LPs) with strong theoretical guarantees and solid empirical performance. The time complexity of these methods is dominated by the cost of solving a linear system of equations at each iteration. In common applications of linear programming, particularly in machine learning and scientific computing, the size of this linear system can become prohibitively large, requiring the use of iterative solvers, which provide an approximate solution to the linear system. However, approximately solving the linear system at each iteration of an IPM invalidates the theoretical guarantees of common IPM analyses. To remedy this, we theoretically and empirically analyze (slightly modified) predictor-corrector IPMs when using approximate linear solvers: our approach guarantees that, when certain conditions are satisfied, the number of IPM iterations does not increase and that the final solution remains feasible. 
We also provide practical instantiations of approximate linear solvers that satisfy these conditions for special classes of constraint matrices using randomized linear algebra.}\n}", "pdf": "https://proceedings.mlr.press/v162/dexter22a/dexter22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/dexter22a-supp.zip", "pdf_size": 1183125, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16628228121296314293&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Purdue University, West Lafayette, IN, USA; Computer Science and Mathematics Division, Oak Ridge National Laboratory, TN, USA; School of Mathematical Sciences, Tel Aviv University, Tel Aviv, Israel; Department of Computer Science, Purdue University, West Lafayette, IN, USA", "aff_domain": "purdue.edu;ornl.gov;tauex.tau.ac.il;purdue.edu", "email": "purdue.edu;ornl.gov;tauex.tau.ac.il;purdue.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/dexter22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Purdue University;Oak Ridge National Laboratory;Tel Aviv University", "aff_unique_dep": "Department of Computer Science;Computer Science and Mathematics Division;School of Mathematical Sciences", "aff_unique_url": "https://www.purdue.edu;https://www.ornl.gov;https://www.tau.ac.il", "aff_unique_abbr": "Purdue;ORNL;TAU", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "West Lafayette;;Tel Aviv", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Israel" }, { "title": "On the Convergence of Local Stochastic Compositional Gradient Descent with Momentum", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16939", "id": "16939", "proceeding": "https://proceedings.mlr.press/v162/gao22c.html", "poster": "/media/PosterPDFs/ICML%202022/f1b8b7b3ceb65c188dcdc0851634cadf.png?t=1657419465.1578186", "slides": "", "author_site": "Hongchang Gao, Junyi Li, Heng Huang", "author": "Hongchang Gao; Junyi Li; Heng Huang", "abstract": "Federated Learning has been actively studied due to its efficiency in numerous real-world applications in the past few years. However, the federated stochastic compositional optimization problem is still underexplored, even though it has widespread applications in machine learning. In this paper, we developed a novel local stochastic compositional gradient descent with momentum method, which facilitates Federated Learning for the stochastic compositional problem. Importantly, we investigated the convergence rate of our proposed method and proved that it can achieve the $O(1/\\epsilon^4)$ sample complexity, which is better than existing methods. Meanwhile, our communication complexity $O(1/\\epsilon^3)$ can match existing methods. To the best of our knowledge, this is the first work achieving such favorable sample and communication complexities. 
Additionally, extensive experimental results demonstrate the superior empirical performance over existing methods, confirming the efficacy of our method.", "bibtex": "@InProceedings{pmlr-v162-gao22c,\n title = \t {On the Convergence of Local Stochastic Compositional Gradient Descent with Momentum},\n author = {Gao, Hongchang and Li, Junyi and Huang, Heng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7017--7035},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22c/gao22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22c.html},\n abstract = \t {Federated Learning has been actively studied due to its efficiency in numerous real-world applications in the past few years. However, the federated stochastic compositional optimization problem is still underexplored, even though it has widespread applications in machine learning. In this paper, we developed a novel local stochastic compositional gradient descent with momentum method, which facilitates Federated Learning for the stochastic compositional problem. Importantly, we investigated the convergence rate of our proposed method and proved that it can achieve the $O(1/\\epsilon^4)$ sample complexity, which is better than existing methods. Meanwhile, our communication complexity $O(1/\\epsilon^3)$ can match existing methods. To the best of our knowledge, this is the first work achieving such favorable sample and communication complexities. Additionally, extensive experimental results demonstrate the superior empirical performance over existing methods, confirming the efficacy of our method.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22c/gao22c.pdf", "supp": "", "pdf_size": 735688, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3983417716527108294&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer and Information Sciences, Temple University, PA, USA; Department of Electrical and Computer Engineering, University of Pittsburgh, PA, USA; Department of Electrical and Computer Engineering, University of Pittsburgh, PA, USA", "aff_domain": "temple.edu;gmail.com;pitt.edu", "email": "temple.edu;gmail.com;pitt.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gao22c.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Temple University;University of Pittsburgh", "aff_unique_dep": "Department of Computer and Information Sciences;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.temple.edu;https://www.pitt.edu", "aff_unique_abbr": "Temple;Pitt", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "PA;Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Convergence of the Shapley Value in Parametric Bayesian Learning Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17063", "id": "17063", "proceeding": "https://proceedings.mlr.press/v162/agussurja22a.html", "poster": "/media/PosterPDFs/ICML%202022/d47bf0af618a3523a226ed7cada85ce3.png?t=1657539712.3896782", "slides": "", "author_site": "Lucas Agussurja, Xinyi Xu, Bryan Kian Hsiang Low", "author": "Lucas Agussurja; 
Xinyi Xu; Bryan Kian Hsiang Low", "abstract": "Measuring contributions is a classical problem in cooperative game theory where the Shapley value is the most well-known solution concept. In this paper, we establish the convergence property of the Shapley value in parametric Bayesian learning games where players perform a Bayesian inference using their combined data, and the posterior-prior KL divergence is used as the characteristic function. We show that for any two players, under some regularity conditions, their difference in Shapley value converges in probability to the difference in Shapley value of a limiting game whose characteristic function is proportional to the log-determinant of the joint Fisher information. As an application, we present an online collaborative learning framework that is asymptotically Shapley-fair. Our result enables this to be achieved without any costly computations of posterior-prior KL divergences. Only a consistent estimator of the Fisher information is needed. The effectiveness of our framework is demonstrated with experiments using real-world data.", "bibtex": "@InProceedings{pmlr-v162-agussurja22a,\n title = \t {On the Convergence of the Shapley Value in Parametric {B}ayesian Learning Games},\n author = {Agussurja, Lucas and Xu, Xinyi and Low, Bryan Kian Hsiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {180--196},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/agussurja22a/agussurja22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/agussurja22a.html},\n abstract = \t {Measuring contributions is a classical problem in cooperative game theory where the Shapley value is the most well-known solution concept. In this paper, we establish the convergence property of the Shapley value in parametric Bayesian learning games where players perform a Bayesian inference using their combined data, and the posterior-prior KL divergence is used as the characteristic function. We show that for any two players, under some regularity conditions, their difference in Shapley value converges in probability to the difference in Shapley value of a limiting game whose characteristic function is proportional to the log-determinant of the joint Fisher information. As an application, we present an online collaborative learning framework that is asymptotically Shapley-fair. Our result enables this to be achieved without any costly computations of posterior-prior KL divergences. Only a consistent estimator of the Fisher information is needed. 
The effectiveness of our framework is demonstrated with experiments using real-world data.}\n}", "pdf": "https://proceedings.mlr.press/v162/agussurja22a/agussurja22a.pdf", "supp": "", "pdf_size": 954440, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7727281335591886084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, National University of Singapore, Singapore + Institute for Infocomm Research, A\u2217STAR, Singapore; Department of Computer Science, National University of Singapore, Singapore + Institute for Infocomm Research, A\u2217STAR, Singapore; Department of Computer Science, National University of Singapore, Singapore", "aff_domain": "comp.nus.edu.sg;comp.nus.edu.sg;comp.nus.edu.sg", "email": "comp.nus.edu.sg;comp.nus.edu.sg;comp.nus.edu.sg", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/agussurja22a.html", "aff_unique_index": "0+1;0+1;0", "aff_unique_norm": "National University of Singapore;Institute for Infocomm Research", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.nus.edu.sg;https://www.i2r.a-star.edu.sg", "aff_unique_abbr": "NUS;I2R", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "Singapore" }, { "title": "On the Difficulty of Defending Self-Supervised Learning against Model Extraction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17053", "id": "17053", "proceeding": "https://proceedings.mlr.press/v162/dziedzic22a.html", "poster": "/media/PosterPDFs/ICML%202022/ccbd8ca962b80445df1f7f38c57759f0.png?t=1658255339.1772735", "slides": "", "author_site": "Adam Dziedzic, Nikita Dhawan, Muhammad Ahmad Kaleem, Jonas Guan, Nicolas Papernot", "author": "Adam Dziedzic; Nikita Dhawan; Muhammad Ahmad Kaleem; Jonas Guan; Nicolas Papernot", "abstract": "Self-Supervised Learning (SSL) is an increasingly popular ML paradigm that trains models to transform complex inputs into representations without relying on explicit labels. These representations encode similarity structures that enable efficient learning of multiple downstream tasks. Recently, ML-as-a-Service providers have commenced offering trained SSL models over inference APIs, which transform user inputs into useful representations for a fee. However, the high cost involved to train these models and their exposure over APIs both make black-box extraction a realistic security threat. We thus explore model stealing attacks against SSL. Unlike traditional model extraction on classifiers that output labels, the victim models here output representations; these representations are of significantly higher dimensionality compared to the low-dimensional prediction scores output by classifiers. We construct several novel attacks and find that approaches that train directly on a victim\u2019s stolen representations are query efficient and enable high accuracy for downstream models. 
We then show that existing defenses against model extraction are inadequate and not easily retrofitted to the specificities of SSL.", "bibtex": "@InProceedings{pmlr-v162-dziedzic22a,\n title = \t {On the Difficulty of Defending Self-Supervised Learning against Model Extraction},\n author = {Dziedzic, Adam and Dhawan, Nikita and Kaleem, Muhammad Ahmad and Guan, Jonas and Papernot, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5757--5776},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dziedzic22a/dziedzic22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dziedzic22a.html},\n abstract = \t {Self-Supervised Learning (SSL) is an increasingly popular ML paradigm that trains models to transform complex inputs into representations without relying on explicit labels. These representations encode similarity structures that enable efficient learning of multiple downstream tasks. Recently, ML-as-a-Service providers have commenced offering trained SSL models over inference APIs, which transform user inputs into useful representations for a fee. However, the high cost involved to train these models and their exposure over APIs both make black-box extraction a realistic security threat. We thus explore model stealing attacks against SSL. Unlike traditional model extraction on classifiers that output labels, the victim models here output representations; these representations are of significantly higher dimensionality compared to the low-dimensional prediction scores output by classifiers. We construct several novel attacks and find that approaches that train directly on a victim\u2019s stolen representations are query efficient and enable high accuracy for downstream models. 
We then show that existing defenses against model extraction are inadequate and not easily retrofitted to the specificities of SSL.}\n}", "pdf": "https://proceedings.mlr.press/v162/dziedzic22a/dziedzic22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/dziedzic22a-supp.zip", "pdf_size": 587287, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16145224211258754535&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Toronto+Vector Institute; University of Toronto+Vector Institute; University of Toronto+Vector Institute; University of Toronto+Vector Institute; University of Toronto+Vector Institute", "aff_domain": "utoronto.ca; ; ; ; ", "email": "utoronto.ca; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dziedzic22a.html", "aff_unique_index": "0+1;0+1;0+1;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "U of T;Vector Institute", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "Canada" }, { "title": "On the Effects of Artificial Data Modification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16087", "id": "16087", "proceeding": "https://proceedings.mlr.press/v162/marcu22a.html", "poster": "/media/PosterPDFs/ICML%202022/afd4836712c5e77550897e25711e1d96.png?t=1657537441.2459643", "slides": "", "author_site": "Antonia Marcu, Adam Prugel-Bennett", "author": "Antonia Marcu; Adam Prugel-Bennett", "abstract": "Data distortion is commonly applied in vision models during both training (e.g methods like MixUp and CutMix) and evaluation (e.g. shape-texture bias and robustness). This data modification can introduce artificial information. It is often assumed that the resulting artefacts are detrimental to training, whilst being negligible when analysing models. We investigate these assumptions and conclude that in some cases they are unfounded and lead to incorrect results. Specifically, we show current shape bias identification methods and occlusion robustness measures are biased and propose a fairer alternative for the latter. Subsequently, through a series of experiments we seek to correct and strengthen the community\u2019s perception of how augmenting affects learning of vision models. Based on our empirical results we argue that the impact of the artefacts must be understood and exploited rather than eliminated.", "bibtex": "@InProceedings{pmlr-v162-marcu22a,\n title = \t {On the Effects of Artificial Data Modification},\n author = {Marcu, Antonia and Prugel-Bennett, Adam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15050--15069},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/marcu22a/marcu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/marcu22a.html},\n abstract = \t {Data distortion is commonly applied in vision models during both training (e.g methods like MixUp and CutMix) and evaluation (e.g. shape-texture bias and robustness). 
This data modification can introduce artificial information. It is often assumed that the resulting artefacts are detrimental to training, whilst being negligible when analysing models. We investigate these assumptions and conclude that in some cases they are unfounded and lead to incorrect results. Specifically, we show current shape bias identification methods and occlusion robustness measures are biased and propose a fairer alternative for the latter. Subsequently, through a series of experiments we seek to correct and strengthen the community\u2019s perception of how augmenting affects learning of vision models. Based on our empirical results we argue that the impact of the artefacts must be understood and exploited rather than eliminated.}\n}", "pdf": "https://proceedings.mlr.press/v162/marcu22a/marcu22a.pdf", "supp": "", "pdf_size": 2141909, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5171301994487774624&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Vision, Learning and Control research group, University of Southampton; Vision, Learning and Control research group, University of Southampton", "aff_domain": "soton.ac.uk; ", "email": "soton.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/marcu22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Southampton", "aff_unique_dep": "Vision, Learning and Control research group", "aff_unique_url": "https://www.southampton.ac.uk", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "On the Equivalence Between Temporal and Static Equivariant Graph Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16897", "id": "16897", "proceeding": "https://proceedings.mlr.press/v162/gao22e.html", "poster": "/media/PosterPDFs/ICML%202022/a2186aa7c086b46ad4e8bf81e2a3a19b.png?t=1657761552.025931", "slides": "", "author_site": "Jianfei Gao, Bruno Ribeiro", "author": "Jianfei Gao; Bruno Ribeiro", "abstract": "This work formalizes the associational task of predicting node attribute evolution in temporal graphs from the perspective of learning equivariant representations. We show that node representations in temporal graphs can be cast into two distinct frameworks: (a) The most popular approach, which we denote as time-and-graph, where equivariant graph (e.g., GNN) and sequence (e.g., RNN) representations are intertwined to represent the temporal evolution of node attributes in the graph; and (b) an approach that we denote as time-then-graph, where the sequences describing the node and edge dynamics are represented first, then fed as node and edge attributes into a static equivariant graph representation that comes after. Interestingly, we show that time-then-graph representations have an expressivity advantage over time-and-graph representations when both use component GNNs that are not most-expressive (e.g., 1-Weisfeiler-Lehman GNNs). 
Moreover, while our goal is not necessarily to obtain state-of-the-art results, our experiments show that time-then-graph methods are capable of achieving better performance and efficiency than state-of-the-art time-and-graph methods in some real-world tasks, thereby showcasing that the time-then-graph framework is a worthy addition to the graph ML toolbox.", "bibtex": "@InProceedings{pmlr-v162-gao22e,\n title = \t {On the Equivalence Between Temporal and Static Equivariant Graph Representations},\n author = {Gao, Jianfei and Ribeiro, Bruno},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7052--7076},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22e/gao22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22e.html},\n abstract = \t {This work formalizes the associational task of predicting node attribute evolution in temporal graphs from the perspective of learning equivariant representations. We show that node representations in temporal graphs can be cast into two distinct frameworks: (a) The most popular approach, which we denote as time-and-graph, where equivariant graph (e.g., GNN) and sequence (e.g., RNN) representations are intertwined to represent the temporal evolution of node attributes in the graph; and (b) an approach that we denote as time-then-graph, where the sequences describing the node and edge dynamics are represented first, then fed as node and edge attributes into a static equivariant graph representation that comes after. Interestingly, we show that time-then-graph representations have an expressivity advantage over time-and-graph representations when both use component GNNs that are not most-expressive (e.g., 1-Weisfeiler-Lehman GNNs). 
Moreover, while our goal is not necessarily to obtain state-of-the-art results, our experiments show that time-then-graph methods are capable of achieving better performance and efficiency than state-of-the-art time-and-graph methods in some real-world tasks, thereby showcasing that the time-then-graph framework is a worthy addition to the graph ML toolbox.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22e/gao22e.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/gao22e-supp.zip", "pdf_size": 683279, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4415902235709647818&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Purdue University, West Lafayette, IN 47906, USA; Department of Computer Science, Purdue University, West Lafayette, IN 47906, USA", "aff_domain": "purdue.edu;cs.purdue.edu", "email": "purdue.edu;cs.purdue.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/gao22e.html", "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "0;0", "aff_campus_unique": "West Lafayette", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the Finite-Time Complexity and Practical Computation of Approximate Stationarity Concepts of Lipschitz Functions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16347", "id": "16347", "proceeding": "https://proceedings.mlr.press/v162/tian22a.html", "poster": "", "slides": "", "author_site": "Lai Tian, Kaiwen Zhou, Anthony Man-Cho So", "author": "Lai Tian; Kaiwen Zhou; Anthony Man-Cho So", "abstract": "We report a practical finite-time algorithmic scheme to compute approximately stationary points for nonconvex nonsmooth Lipschitz functions. In particular, we are interested in two kinds of approximate stationarity notions for nonconvex nonsmooth problems, i.e., Goldstein approximate stationarity (GAS) and near-approximate stationarity (NAS). For GAS, our scheme removes the unrealistic subgradient selection oracle assumption in (Zhang et al., 2020, Assumption 1) and computes GAS with the same finite-time complexity. For NAS, Davis & Drusvyatskiy (2019) showed that $\\rho$-weakly convex functions admit finite-time computation, while Tian & So (2021) provided the matching impossibility results of dimension-free finite-time complexity for first-order methods. Complement to these developments, in this paper, we isolate a new class of functions that could be Clarke irregular (and thus not weakly convex anymore) and show that our new algorithmic scheme can compute NAS points for functions in that class within finite time. 
To demonstrate the wide applicability of our new theoretical framework, we show that $\\rho$-margin SVM, $1$-layer, and $2$-layer ReLU neural networks, all being Clarke irregular, satisfy our new conditions.", "bibtex": "@InProceedings{pmlr-v162-tian22a,\n title = \t {On the Finite-Time Complexity and Practical Computation of Approximate Stationarity Concepts of {L}ipschitz Functions},\n author = {Tian, Lai and Zhou, Kaiwen and So, Anthony Man-Cho},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21360--21379},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tian22a/tian22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tian22a.html},\n abstract = \t {We report a practical finite-time algorithmic scheme to compute approximately stationary points for nonconvex nonsmooth Lipschitz functions. In particular, we are interested in two kinds of approximate stationarity notions for nonconvex nonsmooth problems, i.e., Goldstein approximate stationarity (GAS) and near-approximate stationarity (NAS). For GAS, our scheme removes the unrealistic subgradient selection oracle assumption in (Zhang et al., 2020, Assumption 1) and computes GAS with the same finite-time complexity. For NAS, Davis & Drusvyatskiy (2019) showed that $\\rho$-weakly convex functions admit finite-time computation, while Tian & So (2021) provided the matching impossibility results of dimension-free finite-time complexity for first-order methods. Complement to these developments, in this paper, we isolate a new class of functions that could be Clarke irregular (and thus not weakly convex anymore) and show that our new algorithmic scheme can compute NAS points for functions in that class within finite time. 
To demonstrate the wide applicability of our new theoretical framework, we show that $\\rho$-margin SVM, $1$-layer, and $2$-layer ReLU neural networks, all being Clarke irregular, satisfy our new conditions.}\n}", "pdf": "https://proceedings.mlr.press/v162/tian22a/tian22a.pdf", "supp": "", "pdf_size": 1632120, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9566084202640220946&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Systems Engineering & Engineering Management, The Chinese University of Hong Kong, Shatin, NT, Hong Kong; Department of Computer Science and Engineering, The Chinese University of Hong Kong, Shatin, NT, Hong Kong; Department of Systems Engineering & Engineering Management, The Chinese University of Hong Kong, Shatin, NT, Hong Kong", "aff_domain": "se.cuhk.edu.hk; ;se.cuhk.edu.hk", "email": "se.cuhk.edu.hk; ;se.cuhk.edu.hk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tian22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese University of Hong Kong", "aff_unique_dep": "Department of Systems Engineering & Engineering Management", "aff_unique_url": "https://www.cuhk.edu.hk", "aff_unique_abbr": "CUHK", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "On the Finite-Time Performance of the Knowledge Gradient Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16171", "id": "16171", "proceeding": "https://proceedings.mlr.press/v162/li22f.html", "poster": "/media/PosterPDFs/ICML%202022/37f0e884fbad9667e38940169d0a3c95.png?t=1656580824.795241", "slides": "/media/icml-2022/Slides/16171.pdf", "author_site": "Yanwen Li, Siyang Gao", "author": "Yanwen Li; Siyang Gao", "abstract": "The knowledge gradient (KG) algorithm is a popular and effective algorithm for the best arm identification (BAI) problem. Due to the complex calculation of KG, theoretical analysis of this algorithm is difficult, and existing results are mostly about the asymptotic performance of it, e.g., consistency, asymptotic sample allocation, etc. In this research, we present new theoretical results about the finite-time performance of the KG algorithm. Under independent and normally distributed rewards, we derive lower bounds and upper bounds for the probability of error and simple regret of the algorithm. With these bounds, existing asymptotic results become simple corollaries. We also show the performance of the algorithm for the multi-armed bandit (MAB) problem. These developments not only extend the existing analysis of the KG algorithm, but can also be used to analyze other improvement-based algorithms. 
Last, we use numerical experiments to further demonstrate the finite-time behavior of the KG algorithm.", "bibtex": "@InProceedings{pmlr-v162-li22f,\n title = \t {On the Finite-Time Performance of the Knowledge Gradient Algorithm},\n author = {Li, Yanwen and Gao, Siyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12741--12764},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22f/li22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22f.html},\n abstract = \t {The knowledge gradient (KG) algorithm is a popular and effective algorithm for the best arm identification (BAI) problem. Due to the complex calculation of KG, theoretical analysis of this algorithm is difficult, and existing results are mostly about the asymptotic performance of it, e.g., consistency, asymptotic sample allocation, etc. In this research, we present new theoretical results about the finite-time performance of the KG algorithm. Under independent and normally distributed rewards, we derive lower bounds and upper bounds for the probability of error and simple regret of the algorithm. With these bounds, existing asymptotic results become simple corollaries. We also show the performance of the algorithm for the multi-armed bandit (MAB) problem. These developments not only extend the existing analysis of the KG algorithm, but can also be used to analyze other improvement-based algorithms. Last, we use numerical experiments to further demonstrate the finite-time behavior of the KG algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22f/li22f.pdf", "supp": "", "pdf_size": 1811904, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7204727118937739599&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Advanced Design and Systems Engineering, City University of Hong Kong, Hong Kong+School of Data Science, City University of Hong Kong, Hong Kong; School of Data Science, City University of Hong Kong, Hong Kong", "aff_domain": "my.cityu.edu.hk;cityu.edu.hk", "email": "my.cityu.edu.hk;cityu.edu.hk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22f.html", "aff_unique_index": "0+0;0", "aff_unique_norm": "City University of Hong Kong", "aff_unique_dep": "Department of Advanced Design and Systems Engineering", "aff_unique_url": "https://www.cityu.edu.hk", "aff_unique_abbr": "CityU", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0+0;0", "aff_country_unique": "China" }, { "title": "On the Generalization Analysis of Adversarial Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18077", "id": "18077", "proceeding": "https://proceedings.mlr.press/v162/mustafa22a.html", "poster": "/media/PosterPDFs/ICML%202022/492114f6915a69aa3dd005aa4233ef51.png?t=1658067122.3672378", "slides": "", "author_site": "Waleed Mustafa, Yunwen Lei, Marius Kloft", "author": "Waleed Mustafa; Yunwen Lei; Marius Kloft", "abstract": "Many recent studies have highlighted the susceptibility of virtually all machine-learning models to adversarial attacks. 
Adversarial attacks are imperceptible changes to an input example of a given prediction model. Such changes are carefully designed to alter the otherwise correct prediction of the model. In this paper, we study the generalization properties of adversarial learning. In particular, we derive high-probability generalization bounds on the adversarial risk in terms of the empirical adversarial risk, the complexity of the function class and the adversarial noise set. Our bounds are generally applicable to many models, losses, and adversaries. We showcase its applicability by deriving adversarial generalization bounds for the multi-class classification setting and various prediction models (including linear models and Deep Neural Networks). We also derive optimistic adversarial generalization bounds for the case of smooth losses. These are the first fast-rate bounds valid for adversarial deep learning to the best of our knowledge.", "bibtex": "@InProceedings{pmlr-v162-mustafa22a,\n title = \t {On the Generalization Analysis of Adversarial Learning},\n author = {Mustafa, Waleed and Lei, Yunwen and Kloft, Marius},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16174--16196},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mustafa22a/mustafa22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mustafa22a.html},\n abstract = \t {Many recent studies have highlighted the susceptibility of virtually all machine-learning models to adversarial attacks. Adversarial attacks are imperceptible changes to an input example of a given prediction model. Such changes are carefully designed to alter the otherwise correct prediction of the model. In this paper, we study the generalization properties of adversarial learning. In particular, we derive high-probability generalization bounds on the adversarial risk in terms of the empirical adversarial risk, the complexity of the function class and the adversarial noise set. Our bounds are generally applicable to many models, losses, and adversaries. We showcase its applicability by deriving adversarial generalization bounds for the multi-class classification setting and various prediction models (including linear models and Deep Neural Networks). We also derive optimistic adversarial generalization bounds for the case of smooth losses. 
These are the first fast-rate bounds valid for adversarial deep learning to the best of our knowledge.}\n}", "pdf": "https://proceedings.mlr.press/v162/mustafa22a/mustafa22a.pdf", "supp": "", "pdf_size": 408991, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6109213687879120048&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, University of Kaiserslautern, Germany; School of Computer Science, University of Birmingham, United Kingdom; Department of Computer Science, University of Kaiserslautern, Germany", "aff_domain": "bham.ac.uk; ; ", "email": "bham.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mustafa22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Kaiserslautern;University of Birmingham", "aff_unique_dep": "Department of Computer Science;School of Computer Science", "aff_unique_url": "https://www.uni-kl.de;https://www.birmingham.ac.uk", "aff_unique_abbr": ";UoB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Birmingham", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "On the Hidden Biases of Policy Mirror Ascent in Continuous Action Spaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16503", "id": "16503", "proceeding": "https://proceedings.mlr.press/v162/bedi22a.html", "poster": "/media/PosterPDFs/ICML%202022/8336041a6899d0bce657dcd29409cf7e_QMvus9a.png?t=1658069951.1711853", "slides": "", "author_site": "Amrit Singh Bedi, Souradip Chakraborty, Anjaly Parayil, Brian Sadler, Pratap Tokekar, Alec Koppel", "author": "Amrit Singh Bedi; Souradip Chakraborty; Anjaly Parayil; Brian M Sadler; Pratap Tokekar; Alec Koppel", "abstract": "We focus on parameterized policy search for reinforcement learning over continuous action spaces. Typically, one assumes the score function associated with a policy is bounded, which {fails to hold even for Gaussian policies. } To properly address this issue, one must introduce an exploration tolerance parameter to quantify the region in which it is bounded. Doing so incurs a persistent bias that appears in the attenuation rate of the expected policy gradient norm, which is inversely proportional to the radius of the action space. To mitigate this hidden bias, heavy-tailed policy parameterizations may be used, which exhibit a bounded score function, but doing so can cause instability in algorithmic updates. To address these issues, in this work, we study the convergence of policy gradient algorithms under heavy-tailed parameterizations, which we propose to stabilize with a combination of mirror ascent-type updates and gradient tracking. Our main theoretical contribution is the establishment that this scheme converges with constant batch sizes, whereas prior works require these parameters to respectively shrink to null or grow to infinity. 
Experimentally, this scheme under a heavy-tailed policy parameterization yields improved reward accumulation across a variety of settings as compared with standard benchmarks.", "bibtex": "@InProceedings{pmlr-v162-bedi22a,\n title = \t {On the Hidden Biases of Policy Mirror Ascent in Continuous Action Spaces},\n author = {Bedi, Amrit Singh and Chakraborty, Souradip and Parayil, Anjaly and Sadler, Brian M and Tokekar, Pratap and Koppel, Alec},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1716--1731},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bedi22a/bedi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bedi22a.html},\n abstract = \t {We focus on parameterized policy search for reinforcement learning over continuous action spaces. Typically, one assumes the score function associated with a policy is bounded, which {fails to hold even for Gaussian policies. } To properly address this issue, one must introduce an exploration tolerance parameter to quantify the region in which it is bounded. Doing so incurs a persistent bias that appears in the attenuation rate of the expected policy gradient norm, which is inversely proportional to the radius of the action space. To mitigate this hidden bias, heavy-tailed policy parameterizations may be used, which exhibit a bounded score function, but doing so can cause instability in algorithmic updates. To address these issues, in this work, we study the convergence of policy gradient algorithms under heavy-tailed parameterizations, which we propose to stabilize with a combination of mirror ascent-type updates and gradient tracking. Our main theoretical contribution is the establishment that this scheme converges with constant batch sizes, whereas prior works require these parameters to respectively shrink to null or grow to infinity. 
Experimentally, this scheme under a heavy-tailed policy parameterization yields improved reward accumulation across a variety of settings as compared with standard benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/bedi22a/bedi22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bedi22a-supp.zip", "pdf_size": 989163, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6702624764457746219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Institute of Systems Research, University of Maryland, College Park, USA; Department of Computer Science, University of Maryland, College Park, USA; Microsoft Research, India; DEVCOM Army Research Laboratory, Adelphi, MD, USA; Department of Computer Science, University of Maryland, College Park, USA; JP Morgan Chase AI Research, USA", "aff_domain": "umd.edu; ; ; ; ;gmail.com", "email": "umd.edu; ; ; ; ;gmail.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/bedi22a.html", "aff_unique_index": "0;1;2;3;1;4", "aff_unique_norm": "University of Maryland;University of Maryland, College Park;Microsoft;DEVCOM Army Research Laboratory;JP Morgan Chase", "aff_unique_dep": "Institute of Systems Research;Department of Computer Science;Microsoft Research;;AI Research", "aff_unique_url": "https://www.umd.edu;https://www/umd.edu;https://www.microsoft.com/en-us/research/group/india.aspx;;https://www.jpmorganchase.com", "aff_unique_abbr": "UMD;UMD;MSR India;;JPM", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "College Park;;Adelphi", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;India" }, { "title": "On the Impossibility of Learning to Cooperate with Adaptive Partner Strategies in Repeated Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17013", "id": "17013", "proceeding": "https://proceedings.mlr.press/v162/loftin22a.html", "poster": "/media/PosterPDFs/ICML%202022/294a8ed24b1ad22ec2e7efea049b8737.png?t=1657225657.7794948", "slides": "", "author_site": "Robert Loftin, Frans Oliehoek", "author": "Robert Loftin; Frans A Oliehoek", "abstract": "Learning to cooperate with other agents is challenging when those agents also possess the ability to adapt to our own behavior. Practical and theoretical approaches to learning in cooperative settings typically assume that other agents\u2019 behaviors are stationary, or else make very specific assumptions about other agents\u2019 learning processes. The goal of this work is to understand whether we can reliably learn to cooperate with other agents without such restrictive assumptions, which are unlikely to hold in real-world applications. Our main contribution is a set of impossibility results, which show that no learning algorithm can reliably learn to cooperate with all possible adaptive partners in a repeated matrix game, even if that partner is guaranteed to cooperate with some stationary strategy. 
Motivated by these results, we then discuss potential alternative assumptions which capture the idea that an adaptive partner will only adapt rationally to our behavior.", "bibtex": "@InProceedings{pmlr-v162-loftin22a,\n title = \t {On the Impossibility of Learning to Cooperate with Adaptive Partner Strategies in Repeated Games},\n author = {Loftin, Robert and Oliehoek, Frans A},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14197--14209},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/loftin22a/loftin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/loftin22a.html},\n abstract = \t {Learning to cooperate with other agents is challenging when those agents also possess the ability to adapt to our own behavior. Practical and theoretical approaches to learning in cooperative settings typically assume that other agents\u2019 behaviors are stationary, or else make very specific assumptions about other agents\u2019 learning processes. The goal of this work is to understand whether we can reliably learn to cooperate with other agents without such restrictive assumptions, which are unlikely to hold in real-world applications. Our main contribution is a set of impossibility results, which show that no learning algorithm can reliably learn to cooperate with all possible adaptive partners in a repeated matrix game, even if that partner is guaranteed to cooperate with some stationary strategy. Motivated by these results, we then discuss potential alternative assumptions which capture the idea that an adaptive partner will only adapt rationally to our behavior.}\n}", "pdf": "https://proceedings.mlr.press/v162/loftin22a/loftin22a.pdf", "supp": "", "pdf_size": 287557, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=896999126077156249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Intelligent Systems, Delft University of Technology, Delft, South-Holland, The Netherlands; Department of Intelligent Systems, Delft University of Technology, Delft, South-Holland, The Netherlands", "aff_domain": "tudelft.nl; ", "email": "tudelft.nl; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/loftin22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "Department of Intelligent Systems", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TUDelft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Delft", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "On the Learning of Non-Autoregressive Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17613", "id": "17613", "proceeding": "https://proceedings.mlr.press/v162/huang22k.html", "poster": "/media/PosterPDFs/ICML%202022/b5507f51b88a3ae4a99ba87e4877ab57.png?t=1657798765.492803", "slides": "/media/icml-2022/Slides/17613.pdf", "author_site": "Fei Huang, Tianhua Tao, Hao Zhou, Lei Li, Minlie Huang", "author": "Fei Huang; Tianhua Tao; Hao Zhou; Lei Li; Minlie Huang", "abstract": "Non-autoregressive Transformer (NAT) is a family of text generation models, which aims to reduce the decoding 
latency by predicting the whole sentences in parallel. However, such latency reduction sacrifices the ability to capture left-to-right dependencies, thereby making NAT learning very challenging. In this paper, we present theoretical and empirical analyses to reveal the challenges of NAT learning and propose a unified perspective to understand existing successes. First, we show that simply training NAT by maximizing the likelihood can lead to an approximation of marginal distributions but drops all dependencies between tokens, where the dropped information can be measured by the dataset\u2019s conditional total correlation. Second, we formalize many previous objectives in a unified framework and show that their success can be concluded as maximizing the likelihood on a proxy distribution, leading to a reduced information loss. Empirical studies show that our perspective can explain the phenomena in NAT learning and guide the design of new training methods.", "bibtex": "@InProceedings{pmlr-v162-huang22k,\n title = \t {On the Learning of Non-Autoregressive Transformers},\n author = {Huang, Fei and Tao, Tianhua and Zhou, Hao and Li, Lei and Huang, Minlie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9356--9376},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22k/huang22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22k.html},\n abstract = \t {Non-autoregressive Transformer (NAT) is a family of text generation models, which aims to reduce the decoding latency by predicting the whole sentences in parallel. However, such latency reduction sacrifices the ability to capture left-to-right dependencies, thereby making NAT learning very challenging. In this paper, we present theoretical and empirical analyses to reveal the challenges of NAT learning and propose a unified perspective to understand existing successes. First, we show that simply training NAT by maximizing the likelihood can lead to an approximation of marginal distributions but drops all dependencies between tokens, where the dropped information can be measured by the dataset\u2019s conditional total correlation. Second, we formalize many previous objectives in a unified framework and show that their success can be concluded as maximizing the likelihood on a proxy distribution, leading to a reduced information loss. Empirical studies show that our perspective can explain the phenomena in NAT learning and guide the design of new training methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22k/huang22k.pdf", "supp": "", "pdf_size": 937112, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13837504914625883439&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "The CoAI group, Tsinghua University. Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, DCST, Tsinghua University; The CoAI group, Tsinghua University. 
Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, DCST, Tsinghua University; Institute for AI Industry Research, Tsinghua University; University of California Santa Barbara; The CoAI group, Tsinghua University. Institute for Artificial Intelligence, State Key Lab of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, DCST, Tsinghua University", "aff_domain": "gmail.com;tsinghua.edu.cn; ; ; ", "email": "gmail.com;tsinghua.edu.cn; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/huang22k.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Tsinghua University;University of California, Santa Barbara", "aff_unique_dep": "CoAI group;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucsb.edu", "aff_unique_abbr": "THU;UCSB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "On the Optimization Landscape of Neural Collapse under MSE Loss: Global Optimality with Unconstrained Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16983", "id": "16983", "proceeding": "https://proceedings.mlr.press/v162/zhou22c.html", "poster": "/media/PosterPDFs/ICML%202022/148510031349642de5ca0c544f31b2ef.png?t=1657935184.0027301", "slides": "", "author_site": "Jinxin Zhou, Xiao Li, Tianyu Ding, Chong You, Qing Qu, Zhihui Zhu", "author": "Jinxin Zhou; Xiao Li; Tianyu Ding; Chong You; Qing Qu; Zhihui Zhu", "abstract": "When training deep neural networks for classification tasks, an intriguing empirical phenomenon has been widely observed in the last-layer classifiers and features, where (i) the class means and the last-layer classifiers all collapse to the vertices of a Simplex Equiangular Tight Frame (ETF) up to scaling, and (ii) cross-example within-class variability of last-layer activations collapses to zero. This phenomenon is called Neural Collapse (NC), which seems to take place regardless of the choice of loss functions. In this work, we justify NC under the mean squared error (MSE) loss, where recent empirical evidence shows that it performs comparably or even better than the de-facto cross-entropy loss. Under a simplified unconstrained feature model, we provide the first global landscape analysis for vanilla nonconvex MSE loss and show that the (only!) global minimizers are neural collapse solutions, while all other critical points are strict saddles whose Hessian exhibit negative curvature directions. Furthermore, we justify the usage of rescaled MSE loss by probing the optimization landscape around the NC solutions, showing that the landscape can be improved by tuning the rescaling hyperparameters. 
Finally, our theoretical findings are experimentally verified on practical network architectures.", "bibtex": "@InProceedings{pmlr-v162-zhou22c,\n title = \t {On the Optimization Landscape of Neural Collapse under {MSE} Loss: Global Optimality with Unconstrained Features},\n author = {Zhou, Jinxin and Li, Xiao and Ding, Tianyu and You, Chong and Qu, Qing and Zhu, Zhihui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27179--27202},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22c/zhou22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22c.html},\n abstract = \t {When training deep neural networks for classification tasks, an intriguing empirical phenomenon has been widely observed in the last-layer classifiers and features, where (i) the class means and the last-layer classifiers all collapse to the vertices of a Simplex Equiangular Tight Frame (ETF) up to scaling, and (ii) cross-example within-class variability of last-layer activations collapses to zero. This phenomenon is called Neural Collapse (NC), which seems to take place regardless of the choice of loss functions. In this work, we justify NC under the mean squared error (MSE) loss, where recent empirical evidence shows that it performs comparably or even better than the de-facto cross-entropy loss. Under a simplified unconstrained feature model, we provide the first global landscape analysis for vanilla nonconvex MSE loss and show that the (only!) global minimizers are neural collapse solutions, while all other critical points are strict saddles whose Hessian exhibit negative curvature directions. Furthermore, we justify the usage of rescaled MSE loss by probing the optimization landscape around the NC solutions, showing that the landscape can be improved by tuning the rescaling hyperparameters. 
Finally, our theoretical findings are experimentally verified on practical network architectures.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22c/zhou22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhou22c-supp.zip", "pdf_size": 1485551, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16569131647959807305&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical & Computer Engineering, University of Denver, Denver, USA+1; Department of Electrical & Computer Engineering, University of Michigan, Ann Arbor, USA+2; Microsoft, Redmond, USA; Google Research, New York City, USA; Department of Electrical & Computer Engineering, University of Michigan, Ann Arbor, USA+2; Department of Electrical & Computer Engineering, University of Denver, Denver, USA+1", "aff_domain": "du.edu;umich.edu;\u5fae\u8f6f.com;google.com;umich.edu;du.edu", "email": "du.edu;umich.edu;\u5fae\u8f6f.com;google.com;umich.edu;du.edu", "github": "https://github.com/jinxinzhou/neural-collapse-MSE", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhou22c.html", "aff_unique_index": "0;2;3;4;2;0", "aff_unique_norm": "University of Denver;;University of Michigan;Microsoft;Google", "aff_unique_dep": "Department of Electrical & Computer Engineering;;Department of Electrical & Computer Engineering;Microsoft Corporation;Google Research", "aff_unique_url": "https://www.du.edu;;https://www.umich.edu;https://www.microsoft.com;https://research.google", "aff_unique_abbr": "DU;;UM;Microsoft;Google Research", "aff_campus_unique_index": "0;2;3;4;2;0", "aff_campus_unique": "Denver;;Ann Arbor;Redmond;New York City", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "title": "On the Practicality of Deterministic Epistemic Uncertainty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16365", "id": "16365", "proceeding": "https://proceedings.mlr.press/v162/postels22a.html", "poster": "/media/PosterPDFs/ICML%202022/48aedb8880cab8c45637abc7493ecddd.png?t=1658181810.515972", "slides": "/media/icml-2022/Slides/16365.pdf", "author_site": "Janis Postels, Mattia Seg\u00f9, Tao Sun, Luca Daniel Sieber, Luc Van Gool, Fisher Yu, Federico Tombari", "author": "Janis Postels; Mattia Seg\u00f9; Tao Sun; Luca Daniel Sieber; Luc Van Gool; Fisher Yu; Federico Tombari", "abstract": "A set of novel approaches for estimating epistemic uncertainty in deep neural networks with a single forward pass has recently emerged as a valid alternative to Bayesian Neural Networks. On the premise of informative representations, these deterministic uncertainty methods (DUMs) achieve strong performance on detecting out-of-distribution (OOD) data while adding negligible computational costs at inference time. However, it remains unclear whether DUMs are well calibrated and can seamlessly scale to real-world applications - both prerequisites for their practical deployment. To this end, we first provide a taxonomy of DUMs, and evaluate their calibration under continuous distributional shifts. Then, we extend them to semantic segmentation. 
We find that, while DUMs scale to realistic vision tasks and perform well on OOD detection, the practicality of current methods is undermined by poor calibration under distributional shifts.", "bibtex": "@InProceedings{pmlr-v162-postels22a,\n title = \t {On the Practicality of Deterministic Epistemic Uncertainty},\n author = {Postels, Janis and Seg{\\`u}, Mattia and Sun, Tao and Sieber, Luca Daniel and Van Gool, Luc and Yu, Fisher and Tombari, Federico},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17870--17909},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/postels22a/postels22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/postels22a.html},\n abstract = \t {A set of novel approaches for estimating epistemic uncertainty in deep neural networks with a single forward pass has recently emerged as a valid alternative to Bayesian Neural Networks. On the premise of informative representations, these deterministic uncertainty methods (DUMs) achieve strong performance on detecting out-of-distribution (OOD) data while adding negligible computational costs at inference time. However, it remains unclear whether DUMs are well calibrated and can seamlessly scale to real-world applications - both prerequisites for their practical deployment. To this end, we first provide a taxonomy of DUMs, and evaluate their calibration under continuous distributional shifts. Then, we extend them to semantic segmentation. We find that, while DUMs scale to realistic vision tasks and perform well on OOD detection, the practicality of current methods is undermined by poor calibration under distributional shifts.}\n}", "pdf": "https://proceedings.mlr.press/v162/postels22a/postels22a.pdf", "supp": "", "pdf_size": 9141400, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10237983835645354047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "ETH Zurich; ETH Zurich; ETH Zurich; ETH Zurich; ETH Zurich; ETH Zurich; Technical University Munich + Google", "aff_domain": "ethz.ch;ethz.ch; ; ; ; ; ", "email": "ethz.ch;ethz.ch; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/postels22a.html", "aff_unique_index": "0;0;0;0;0;0;1+2", "aff_unique_norm": "ETH Zurich;Technical University of Munich;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.ethz.ch;https://www.tum.de;https://www.google.com", "aff_unique_abbr": "ETHZ;TUM;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1+2", "aff_country_unique": "Switzerland;Germany;United States" }, { "title": "On the Robustness of CountSketch to Adaptive Inputs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18043", "id": "18043", "proceeding": "https://proceedings.mlr.press/v162/cohen22a.html", "poster": "/media/PosterPDFs/ICML%202022/b65f2ecd2900ba6ae49a14d9c4b16fb4_s7kG69K.png?t=1658148718.9737954", "slides": "", "author_site": "Edith Cohen, Xin Lyu, Jelani Nelson, Tamas Sarlos, Moshe Shechner, Uri Stemmer", "author": "Edith Cohen; Xin Lyu; Jelani Nelson; Tamas Sarlos; Moshe Shechner; Uri Stemmer", "abstract": "The last decade saw 
impressive progress towards understanding the performance of algorithms in", "bibtex": "@InProceedings{pmlr-v162-cohen22a,\n title = \t {On the Robustness of {C}ount{S}ketch to Adaptive Inputs},\n author = {Cohen, Edith and Lyu, Xin and Nelson, Jelani and Sarlos, Tamas and Shechner, Moshe and Stemmer, Uri},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4112--4140},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cohen22a/cohen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cohen22a.html},\n abstract = \t {The last decade saw impressive progress towards understanding the performance of algorithms in", "pdf": "https://proceedings.mlr.press/v162/cohen22a/cohen22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/cohen22a-supp.zip", "pdf_size": 1580604, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1018825247812712770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research + Tel Aviv University; UC Berkeley; UC Berkeley; Google Research; Tel Aviv University; Tel Aviv University", "aff_domain": "cohenwang.com;gmail.com;alum.mit.edu;google.com;gmail.com;uri.co.il", "email": "cohenwang.com;gmail.com;alum.mit.edu;google.com;gmail.com;uri.co.il", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/cohen22a.html", "aff_unique_index": "0+1;2;2;0;1;1", "aff_unique_norm": "Google;Tel Aviv University;University of California, Berkeley", "aff_unique_dep": "Google Research;;", "aff_unique_url": "https://research.google;https://www.tau.ac.il;https://www.berkeley.edu", "aff_unique_abbr": "Google Research;TAU;UC Berkeley", "aff_campus_unique_index": "0;2;2;0", "aff_campus_unique": "Mountain View;;Berkeley", "aff_country_unique_index": "0+1;0;0;0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "On the Role of Discount Factor in Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18281", "id": "18281", "proceeding": "https://proceedings.mlr.press/v162/hu22d.html", "poster": "/media/PosterPDFs/ICML%202022/b112ca4087d668785e947a57493d1740.png?t=1657892698.903578", "slides": "", "author_site": "Hao Hu, yiqin yang, Qianchuan Zhao, Chongjie Zhang", "author": "Hao Hu; Yiqin Yang; Qianchuan Zhao; Chongjie Zhang", "abstract": "Offline reinforcement learning (RL) enables effective learning from previously collected data without exploration, which shows great promise in real-world applications when exploration is expensive or even infeasible. The discount factor, $\\gamma$, plays a vital role in improving online RL sample efficiency and estimation accuracy, but the role of the discount factor in offline RL is not well explored. This paper examines two distinct effects of $\\gamma$ in offline RL with theoretical analysis, namely the regularization effect and the pessimism effect. On the one hand, $\\gamma$ is a regulator to trade-off optimality with sample efficiency upon existing offline techniques. On the other hand, lower guidance $\\gamma$ can also be seen as a way of pessimism where we optimize the policy\u2019s performance in the worst possible models. 
We empirically verify the above theoretical observation with tabular MDPs and standard D4RL tasks. The results show that the discount factor plays an essential role in the performance of offline RL algorithms, both under small data regimes upon existing offline methods and in large data regimes without other conservative methods.", "bibtex": "@InProceedings{pmlr-v162-hu22d,\n title = \t {On the Role of Discount Factor in Offline Reinforcement Learning},\n author = {Hu, Hao and Yang, Yiqin and Zhao, Qianchuan and Zhang, Chongjie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9072--9098},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hu22d/hu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/hu22d.html},\n abstract = \t {Offline reinforcement learning (RL) enables effective learning from previously collected data without exploration, which shows great promise in real-world applications when exploration is expensive or even infeasible. The discount factor, $\\gamma$, plays a vital role in improving online RL sample efficiency and estimation accuracy, but the role of the discount factor in offline RL is not well explored. This paper examines two distinct effects of $\\gamma$ in offline RL with theoretical analysis, namely the regularization effect and the pessimism effect. On the one hand, $\\gamma$ is a regulator to trade-off optimality with sample efficiency upon existing offline techniques. On the other hand, lower guidance $\\gamma$ can also be seen as a way of pessimism where we optimize the policy\u2019s performance in the worst possible models. We empirically verify the above theoretical observation with tabular MDPs and standard D4RL tasks. 
The results show that the discount factor plays an essential role in the performance of offline RL algorithms, both under small data regimes upon existing offline methods and in large data regimes without other conservative methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/hu22d/hu22d.pdf", "supp": "", "pdf_size": 1283374, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4559230627634203753&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Institute of Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Institute of Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Department of Automation, Tsinghua University, Beijing, China; Institute of Interdisciplinary Information Sciences, Tsinghua University, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn;mails.tsinghua.edu.cn; ; ", "email": "mails.tsinghua.edu.cn;mails.tsinghua.edu.cn; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hu22d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Institute of Interdisciplinary Information Sciences", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "On the Sample Complexity of Learning Infinite-horizon Discounted Linear Kernel MDPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17895", "id": "17895", "proceeding": "https://proceedings.mlr.press/v162/chen22f.html", "poster": "", "slides": "", "author_site": "Yuanzhou Chen, Jiafan He, Quanquan Gu", "author": "Yuanzhou Chen; Jiafan He; Quanquan Gu", "abstract": "We study reinforcement learning for infinite-horizon discounted linear kernel MDPs, where the transition probability function is linear in a predefined feature mapping. Existing UCLK \\citep{zhou2020provably} algorithm for this setting only has a regret guarantee, which cannot lead to a tight sample complexity bound. In this paper, we extend the uniform-PAC sample complexity from episodic setting to the infinite-horizon discounted setting, and propose a novel algorithm dubbed UPAC-UCLK that achieves an $\\Tilde{O}\\big(d^2/((1-\\gamma)^4\\epsilon^2)+1/((1-\\gamma)^6\\epsilon^2)\\big)$ uniform-PAC sample complexity, where $d$ is the dimension of the feature mapping, $\\gamma \\in(0,1)$ is the discount factor of the MDP and $\\epsilon$ is the accuracy parameter. 
To the best of our knowledge, this is the first $\\tilde{O}(1/\\epsilon^2)$ sample complexity bound for learning infinite-horizon discounted MDPs with linear function approximation (without access to the generative model).", "bibtex": "@InProceedings{pmlr-v162-chen22f,\n title = \t {On the Sample Complexity of Learning Infinite-horizon Discounted Linear Kernel {MDP}s},\n author = {Chen, Yuanzhou and He, Jiafan and Gu, Quanquan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3149--3183},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22f/chen22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22f.html},\n abstract = \t {We study reinforcement learning for infinite-horizon discounted linear kernel MDPs, where the transition probability function is linear in a predefined feature mapping. Existing UCLK \\citep{zhou2020provably} algorithm for this setting only has a regret guarantee, which cannot lead to a tight sample complexity bound. In this paper, we extend the uniform-PAC sample complexity from episodic setting to the infinite-horizon discounted setting, and propose a novel algorithm dubbed UPAC-UCLK that achieves an $\\Tilde{O}\\big(d^2/((1-\\gamma)^4\\epsilon^2)+1/((1-\\gamma)^6\\epsilon^2)\\big)$ uniform-PAC sample complexity, where $d$ is the dimension of the feature mapping, $\\gamma \\in(0,1)$ is the discount factor of the MDP and $\\epsilon$ is the accuracy parameter. To the best of our knowledge, this is the first $\\tilde{O}(1/\\epsilon^2)$ sample complexity bound for learning infinite-horizon discounted MDPs with linear function approximation (without access to the generative model).}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22f/chen22f.pdf", "supp": "", "pdf_size": 485206, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=782522848354227479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Mathematical Sciences, Peking University, Beijing, China; Computer Science Department, University of California at Los Angeles, Los Angeles, California, USA; Computer Science Department, University of California at Los Angeles, Los Angeles, California, USA", "aff_domain": "cs.ucla.edu; ;cs.ucla.edu", "email": "cs.ucla.edu; ;cs.ucla.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22f.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Peking University;University of California, Los Angeles", "aff_unique_dep": "School of Mathematical Sciences;Computer Science Department", "aff_unique_url": "http://www.pku.edu.cn;https://www.ucla.edu", "aff_unique_abbr": "PKU;UCLA", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Beijing;Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "On the Statistical Benefits of Curriculum Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16551", "id": "16551", "proceeding": "https://proceedings.mlr.press/v162/xu22i.html", "poster": "/media/PosterPDFs/ICML%202022/c5658c711ba9170700fc7d3ee3f63e40.png?t=1657575894.067732", "slides": "", "author_site": "Ziping Xu, Ambuj Tewari", "author": "Ziping Xu; Ambuj 
Tewari", "abstract": "Curriculum learning (CL) is a commonly used machine learning training strategy. However, we still lack a clear theoretical understanding of CL\u2019s benefits. In this paper, we study the benefits of CL in the multitask linear regression problem under both structured and unstructured settings. For both settings, we derive the minimax rates for CL with the oracle that provides the optimal curriculum and without the oracle, where the agent has to adaptively learn a good curriculum. Our results reveal that adaptive learning can be fundamentally harder than the oracle learning in the unstructured setting, but it merely introduces a small extra term in the structured setting. To connect theory with practice, we provide justification for a popular empirical method that selects tasks with highest local prediction gain by comparing its guarantees with the minimax rates mentioned above.", "bibtex": "@InProceedings{pmlr-v162-xu22i,\n title = \t {On the Statistical Benefits of Curriculum Learning},\n author = {Xu, Ziping and Tewari, Ambuj},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24663--24682},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22i/xu22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22i.html},\n abstract = \t {Curriculum learning (CL) is a commonly used machine learning training strategy. However, we still lack a clear theoretical understanding of CL\u2019s benefits. In this paper, we study the benefits of CL in the multitask linear regression problem under both structured and unstructured settings. For both settings, we derive the minimax rates for CL with the oracle that provides the optimal curriculum and without the oracle, where the agent has to adaptively learn a good curriculum. Our results reveal that adaptive learning can be fundamentally harder than the oracle learning in the unstructured setting, but it merely introduces a small extra term in the structured setting. 
To connect theory with practice, we provide justification for a popular empirical method that selects tasks with highest local prediction gain by comparing its guarantees with the minimax rates mentioned above.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22i/xu22i.pdf", "supp": "", "pdf_size": 427815, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4829985818584902408&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, University of Michigan, Ann Arbor; Department of Statistics, University of Michigan, Ann Arbor", "aff_domain": "umich.edu; ", "email": "umich.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/xu22i.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On the Surrogate Gap between Contrastive and Supervised Losses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17263", "id": "17263", "proceeding": "https://proceedings.mlr.press/v162/bao22e.html", "poster": "/media/PosterPDFs/ICML%202022/3d773b5ce67533d1b5b52d9b57936860.png?t=1657846813.8614562", "slides": "", "author_site": "Han Bao, Yoshihiro Nagano, Kento Nozawa", "author": "Han Bao; Yoshihiro Nagano; Kento Nozawa", "abstract": "Contrastive representation learning encourages data representation to make semantically similar pairs closer than randomly drawn negative samples, which has been successful in various domains such as vision, language, and graphs. Recent theoretical studies have attempted to explain the benefit of the large negative sample size by upper-bounding the downstream classification loss with the contrastive loss. However, the previous surrogate bounds have two drawbacks: they are only legitimate for a limited range of negative sample sizes and prohibitively large even within that range. Due to these drawbacks, there still does not exist a consensus on how negative sample size theoretically correlates with downstream classification performance. Following the simplified setting where positive pairs are drawn from the true distribution (not generated by data augmentation; as supposed in previous studies), this study establishes surrogate upper and lower bounds for the downstream classification loss for all negative sample sizes that best explain the empirical observations on the negative sample size in the earlier studies. Our bounds suggest that the contrastive loss can be viewed as a surrogate objective of the downstream loss and larger negative sample sizes improve downstream classification because the surrogate gap between contrastive and supervised losses decays. 
We verify that our theory is consistent with experiments on synthetic, vision, and language datasets.", "bibtex": "@InProceedings{pmlr-v162-bao22e,\n title = \t {On the Surrogate Gap between Contrastive and Supervised Losses},\n author = {Bao, Han and Nagano, Yoshihiro and Nozawa, Kento},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1585--1606},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bao22e/bao22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/bao22e.html},\n abstract = \t {Contrastive representation learning encourages data representation to make semantically similar pairs closer than randomly drawn negative samples, which has been successful in various domains such as vision, language, and graphs. Recent theoretical studies have attempted to explain the benefit of the large negative sample size by upper-bounding the downstream classification loss with the contrastive loss. However, the previous surrogate bounds have two drawbacks: they are only legitimate for a limited range of negative sample sizes and prohibitively large even within that range. Due to these drawbacks, there still does not exist a consensus on how negative sample size theoretically correlates with downstream classification performance. Following the simplified setting where positive pairs are drawn from the true distribution (not generated by data augmentation; as supposed in previous studies), this study establishes surrogate upper and lower bounds for the downstream classification loss for all negative sample sizes that best explain the empirical observations on the negative sample size in the earlier studies. Our bounds suggest that the contrastive loss can be viewed as a surrogate objective of the downstream loss and larger negative sample sizes improve downstream classification because the surrogate gap between contrastive and supervised losses decays. 
We verify that our theory is consistent with experiments on synthetic, vision, and language datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/bao22e/bao22e.pdf", "supp": "", "pdf_size": 2458232, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17468865477895467662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "The University of Tokyo, Tokyo, Japan + RIKEN AIP, Tokyo, Japan; The University of Tokyo, Tokyo, Japan + RIKEN AIP, Tokyo, Japan; The University of Tokyo, Tokyo, Japan + RIKEN AIP, Tokyo, Japan", "aff_domain": "i.kyoto-u.ac.jp; ; ", "email": "i.kyoto-u.ac.jp; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bao22e.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "University of Tokyo;RIKEN AIP", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://aip.Riken.jp", "aff_unique_abbr": "UTokyo;RIKEN AIP", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Japan" }, { "title": "One-Pass Algorithms for MAP Inference of Nonsymmetric Determinantal Point Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17339", "id": "17339", "proceeding": "https://proceedings.mlr.press/v162/reddy22a.html", "poster": "/media/PosterPDFs/ICML%202022/4462bf0ddbe0d0da40e1e828ebebeb11.png?t=1658163156.893128", "slides": "", "author_site": "Aravind Reddy, Ryan A. Rossi, Zhao Song, Anup Rao, Tung Mai, Nedim Lipka, Gang Wu, Eunyee Koh, Nesreen K Ahmed", "author": "Aravind Reddy; Ryan A. Rossi; Zhao Song; Anup Rao; Tung Mai; Nedim Lipka; Gang Wu; Eunyee Koh; Nesreen Ahmed", "abstract": "In this paper, we initiate the study of one-pass algorithms for solving the maximum-a-posteriori (MAP) inference problem for Non-symmetric Determinantal Point Processes (NDPPs). In particular, we formulate streaming and online versions of the problem and provide one-pass algorithms for solving these problems. In our streaming setting, data points arrive in an arbitrary order and the algorithms are constrained to use a single-pass over the data as well as sub-linear memory, and only need to output a valid solution at the end of the stream. Our online setting has an additional requirement of maintaining a valid solution at any point in time. We design new one-pass algorithms for these problems and show that they perform comparably to (or even better than) the offline greedy algorithm while using substantially lower memory.", "bibtex": "@InProceedings{pmlr-v162-reddy22a,\n title = \t {One-Pass Algorithms for {MAP} Inference of Nonsymmetric Determinantal Point Processes},\n author = {Reddy, Aravind and Rossi, Ryan A. 
and Song, Zhao and Rao, Anup and Mai, Tung and Lipka, Nedim and Wu, Gang and Koh, Eunyee and Ahmed, Nesreen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18463--18482},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/reddy22a/reddy22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/reddy22a.html},\n abstract = \t {In this paper, we initiate the study of one-pass algorithms for solving the maximum-a-posteriori (MAP) inference problem for Non-symmetric Determinantal Point Processes (NDPPs). In particular, we formulate streaming and online versions of the problem and provide one-pass algorithms for solving these problems. In our streaming setting, data points arrive in an arbitrary order and the algorithms are constrained to use a single-pass over the data as well as sub-linear memory, and only need to output a valid solution at the end of the stream. Our online setting has an additional requirement of maintaining a valid solution at any point in time. We design new one-pass algorithms for these problems and show that they perform comparably to (or even better than) the offline greedy algorithm while using substantially lower memory.}\n}", "pdf": "https://proceedings.mlr.press/v162/reddy22a/reddy22a.pdf", "supp": "", "pdf_size": 1436119, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13549866217317052186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Northwestern University; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Adobe Research; Intel Labs", "aff_domain": "cs.northwestern.edu; ; ; ; ; ; ; ; ", "email": "cs.northwestern.edu; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/reddy22a.html", "aff_unique_index": "0;1;1;1;1;1;1;1;2", "aff_unique_norm": "Northwestern University;Adobe;Intel", "aff_unique_dep": ";Adobe Research;Intel Labs", "aff_unique_url": "https://www.northwestern.edu;https://research.adobe.com;https://www.intel.com", "aff_unique_abbr": "NU;Adobe;Intel", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "One-Pass Diversified Sampling with Application to Terabyte-Scale Genomic Sequence Streams", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16139", "id": "16139", "proceeding": "https://proceedings.mlr.press/v162/coleman22a.html", "poster": "/media/PosterPDFs/ICML%202022/977f6f5f8bd5054b7ec5f72374d5e68d.png?t=1658116895.9454436", "slides": "", "author_site": "Benjamin Coleman, Benito Geordie, Li Chou, R. A. Leo Elworth, Todd Treangen, Anshumali Shrivastava", "author": "Benjamin Coleman; Benito Geordie; Li Chou; R. A. Leo Elworth; Todd Treangen; Anshumali Shrivastava", "abstract": "A popular approach to reduce the size of a massive dataset is to apply efficient online sampling to the stream of data as it is read or generated. Online sampling routines are currently restricted to variations of reservoir sampling, where each sample is selected uniformly and independently of other samples. 
This renders them unsuitable for large-scale applications in computational biology, such as metagenomic community profiling and protein function annotation, which suffer from severe class imbalance. To maintain a representative and diverse sample, we must identify and preferentially select data that are likely to belong to rare classes. We argue that existing schemes for diversity sampling have prohibitive overhead for large-scale problems and high-throughput streams. We propose an efficient sampling routine that uses an online representation of the data distribution as a prefilter to retain elements from rare groups. We apply this method to several genomic data analysis tasks and demonstrate significant speedup in downstream analysis without sacrificing the quality of the results. Because our algorithm is 2x faster and uses 1000x less memory than coreset, reservoir and sketch-based alternatives, we anticipate that it will become a useful preprocessing step for applications with large-scale streaming data.", "bibtex": "@InProceedings{pmlr-v162-coleman22a,\n title = \t {One-Pass Diversified Sampling with Application to Terabyte-Scale Genomic Sequence Streams},\n author = {Coleman, Benjamin and Geordie, Benito and Chou, Li and Elworth, R. A. Leo and Treangen, Todd and Shrivastava, Anshumali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4202--4218},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/coleman22a/coleman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/coleman22a.html},\n abstract = \t {A popular approach to reduce the size of a massive dataset is to apply efficient online sampling to the stream of data as it is read or generated. Online sampling routines are currently restricted to variations of reservoir sampling, where each sample is selected uniformly and independently of other samples. This renders them unsuitable for large-scale applications in computational biology, such as metagenomic community profiling and protein function annotation, which suffer from severe class imbalance. To maintain a representative and diverse sample, we must identify and preferentially select data that are likely to belong to rare classes. We argue that existing schemes for diversity sampling have prohibitive overhead for large-scale problems and high-throughput streams. We propose an efficient sampling routine that uses an online representation of the data distribution as a prefilter to retain elements from rare groups. We apply this method to several genomic data analysis tasks and demonstrate significant speedup in downstream analysis without sacrificing the quality of the results. 
Because our algorithm is 2x faster and uses 1000x less memory than coreset, reservoir and sketch-based alternatives, we anticipate that it will become a useful preprocessing step for applications with large-scale streaming data.}\n}", "pdf": "https://proceedings.mlr.press/v162/coleman22a/coleman22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/coleman22a-supp.zip", "pdf_size": 2656427, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3027061590869303232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Electrical and Computer Engineering, Rice University, Houston TX, USA+Third AI, Houston TX, USA; Department of Computer Science, Rice University, Houston, TX, USA+Third AI, Houston TX, USA; Department of Engineering and Computer Science, West Texas A&M University, Canyon TX, USA; Department of Computer Science, Rice University, Houston, TX, USA; Department of Computer Science, Rice University, Houston, TX, USA; Department of Computer Science, Rice University, Houston, TX, USA+Third AI, Houston TX, USA", "aff_domain": "rice.edu;rice.edu; ;rice.edu; ;rice.edu", "email": "rice.edu;rice.edu; ;rice.edu; ;rice.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/coleman22a.html", "aff_unique_index": "0+1;0+1;2;0;0;0+1", "aff_unique_norm": "Rice University;Third AI;West Texas A&M University", "aff_unique_dep": "Department of Electrical and Computer Engineering;;Department of Engineering and Computer Science", "aff_unique_url": "https://www.rice.edu;;https://www.wtamu.edu", "aff_unique_abbr": "Rice;;WTAMU", "aff_campus_unique_index": "0+0;0+0;1;0;0;0+0", "aff_campus_unique": "Houston;Canyon", "aff_country_unique_index": "0+0;0+0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Online Active Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17813", "id": "17813", "proceeding": "https://proceedings.mlr.press/v162/chen22l.html", "poster": "/media/PosterPDFs/ICML%202022/2c601ad9d2ff9bc8b282670cdd54f69f.png?t=1657955124.0565698", "slides": "", "author_site": "Cheng Chen, Yi Li, Yiming Sun", "author": "Cheng Chen; Yi Li; Yiming Sun", "abstract": "Active regression considers a linear regression problem where the learner receives a large number of data points but can only observe a small number of labels. Since online algorithms can deal with incremental training data and take advantage of low computational cost, we consider an online extension of the active regression problem: the learner receives data points one by one and immediately decides whether it should collect the corresponding labels. The goal is to efficiently maintain the regression of received data points with a small budget of label queries. We propose novel algorithms for this problem under $\\ell_p$ loss where $p\\in[1,2]$. To achieve a $(1+\\epsilon)$-approximate solution, our proposed algorithms only requires $\\tilde{\\mathcal{O}}(d/poly(\\epsilon))$ queries of labels. 
The numerical results verify our theoretical results and show that our methods have comparable performance with offline active regression algorithms.", "bibtex": "@InProceedings{pmlr-v162-chen22l,\n title = \t {Online Active Regression},\n author = {Chen, Cheng and Li, Yi and Sun, Yiming},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3320--3335},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22l/chen22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22l.html},\n abstract = \t {Active regression considers a linear regression problem where the learner receives a large number of data points but can only observe a small number of labels. Since online algorithms can deal with incremental training data and take advantage of low computational cost, we consider an online extension of the active regression problem: the learner receives data points one by one and immediately decides whether it should collect the corresponding labels. The goal is to efficiently maintain the regression of received data points with a small budget of label queries. We propose novel algorithms for this problem under $\\ell_p$ loss where $p\\in[1,2]$. To achieve a $(1+\\epsilon)$-approximate solution, our proposed algorithms only requires $\\tilde{\\mathcal{O}}(d/poly(\\epsilon))$ queries of labels. The numerical results verify our theoretical results and show that our methods have comparable performance with offline active regression algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22l/chen22l.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22l-supp.zip", "pdf_size": 459182, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4758025210608693169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Physical and Mathematical Sciences, Nanyang Technological University, Singapore; School of Physical and Mathematical Sciences, Nanyang Technological University, Singapore; School of Physical and Mathematical Sciences, Nanyang Technological University, Singapore", "aff_domain": "ntu.edu.sg; ; ", "email": "ntu.edu.sg; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22l.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanyang Technological University", "aff_unique_dep": "School of Physical and Mathematical Sciences", "aff_unique_url": "https://www.ntu.edu.sg", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Singapore", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Online Algorithms with Multiple Predictions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16439", "id": "16439", "proceeding": "https://proceedings.mlr.press/v162/anand22a.html", "poster": "/media/PosterPDFs/ICML%202022/0dbd53751c00e0784799008b44471a77_XTEi6nq.png?t=1658340573.3208945", "slides": "/media/icml-2022/Slides/16439.pdf", "author_site": "Keerti Anand, Rong Ge, Amit Kumar, Debmalya Panigrahi", "author": "Keerti Anand; Rong Ge; Amit Kumar; Debmalya Panigrahi", "abstract": "This paper studies online algorithms augmented with", "bibtex": 
"@InProceedings{pmlr-v162-anand22a,\n title = \t {Online Algorithms with Multiple Predictions},\n author = {Anand, Keerti and Ge, Rong and Kumar, Amit and Panigrahi, Debmalya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {582--598},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/anand22a/anand22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/anand22a.html},\n abstract = \t {This paper studies online algorithms augmented with", "pdf": "https://proceedings.mlr.press/v162/anand22a/anand22a.pdf", "supp": "", "pdf_size": 363756, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4637438542022827691&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Duke University, Durham, NC, USA; Department of Computer Science, Duke University, Durham, NC, USA; Department of Computer Science and Engineering, IIT Delhi, New Delhi, India; Department of Computer Science, Duke University, Durham, NC, USA", "aff_domain": "cs.duke.edu; ; ; ", "email": "cs.duke.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/anand22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Duke University;Indian Institute of Technology Delhi", "aff_unique_dep": "Department of Computer Science;Department of Computer Science and Engineering", "aff_unique_url": "https://www.duke.edu;https://www.iitd.ac.in", "aff_unique_abbr": "Duke;IIT Delhi", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Durham;New Delhi", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;India" }, { "title": "Online Balanced Experimental Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17397", "id": "17397", "proceeding": "https://proceedings.mlr.press/v162/arbour22a.html", "poster": "", "slides": "", "author_site": "David Arbour, Drew Dimmery, Tung Mai, Anup Rao", "author": "David Arbour; Drew Dimmery; Tung Mai; Anup Rao", "abstract": "We consider the experimental design problem in an online environment, an important practical task for reducing the variance of estimates in randomized experiments which allows for greater precision, and in turn, improved decision making. In this work, we present algorithms that build on recent advances in online discrepancy minimization which accommodate both arbitrary treatment probabilities and multiple treatments. The proposed algorithms are computational efficient, minimize covariate imbalance, and include randomization which enables robustness to misspecification. We provide worst case bounds on the expected mean squared error of the causal estimate and show that the proposed estimator is no worse than an implicit ridge regression, which are within a logarithmic factor of the best known results for offline experimental design. 
We conclude with a detailed simulation study showing favorable results relative to complete randomization as well as to offline methods for experimental design with time complexities exceeding our algorithm, which has a linear dependence on the number of observations, by polynomial factors.", "bibtex": "@InProceedings{pmlr-v162-arbour22a,\n title = \t {Online Balanced Experimental Design},\n author = {Arbour, David and Dimmery, Drew and Mai, Tung and Rao, Anup},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {844--864},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/arbour22a/arbour22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/arbour22a.html},\n abstract = \t {We consider the experimental design problem in an online environment, an important practical task for reducing the variance of estimates in randomized experiments which allows for greater precision, and in turn, improved decision making. In this work, we present algorithms that build on recent advances in online discrepancy minimization which accommodate both arbitrary treatment probabilities and multiple treatments. The proposed algorithms are computational efficient, minimize covariate imbalance, and include randomization which enables robustness to misspecification. We provide worst case bounds on the expected mean squared error of the causal estimate and show that the proposed estimator is no worse than an implicit ridge regression, which are within a logarithmic factor of the best known results for offline experimental design. 
We conclude with a detailed simulation study showing favorable results relative to complete randomization as well as to offline methods for experimental design with time complexities exceeding our algorithm, which has a linear dependence on the number of observations, by polynomial factors.}\n}", "pdf": "https://proceedings.mlr.press/v162/arbour22a/arbour22a.pdf", "supp": "", "pdf_size": 685390, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9578642124774969527&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Adobe Research, San Jose, CA, USA+Data Science @ University of Vienna, Vienna, AT; Data Science @ University of Vienna, Vienna, AT; Adobe Research, San Jose, CA, USA; Adobe Research, San Jose, CA, USA", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/arbour22a.html", "aff_unique_index": "0+1;1;0;0", "aff_unique_norm": "Adobe;University of Vienna", "aff_unique_dep": "Adobe Research;Data Science", "aff_unique_url": "https://research.adobe.com;https://univie.ac.at", "aff_unique_abbr": "Adobe;Uni Vienna", "aff_campus_unique_index": "0+1;1;0;0", "aff_campus_unique": "San Jose;Vienna", "aff_country_unique_index": "0+1;1;0;0", "aff_country_unique": "United States;Austria" }, { "title": "Online Continual Learning through Mutual Information Maximization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17487", "id": "17487", "proceeding": "https://proceedings.mlr.press/v162/guo22g.html", "poster": "/media/PosterPDFs/ICML%202022/8b1ecf6d8049bb062a356f1cc812e69e_2kbOZeX.png?t=1657636107.959609", "slides": "", "author_site": "Yiduo Guo, Bing Liu, Dongyan Zhao", "author": "Yiduo Guo; Bing Liu; Dongyan Zhao", "abstract": "This paper proposed a new online continual learning approach called OCM based on", "bibtex": "@InProceedings{pmlr-v162-guo22g,\n title = \t {Online Continual Learning through Mutual Information Maximization},\n author = {Guo, Yiduo and Liu, Bing and Zhao, Dongyan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8109--8126},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22g/guo22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22g.html},\n abstract = \t {This paper proposed a new online continual learning approach called OCM based on", "pdf": "https://proceedings.mlr.press/v162/guo22g/guo22g.pdf", "supp": "", "pdf_size": 1574709, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8933371745723604913&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Wangxuan Institute of Computer Technology, Peking University + Artificial Intelligence Institute, Peking University; Department of Computer Science, University of Illinois at Chicago; Wangxuan Institute of Computer Technology, Peking University + Artificial Intelligence Institute, Peking University", "aff_domain": "pku.edu.cn;uic.edu;pku.edu.cn", "email": "pku.edu.cn;uic.edu;pku.edu.cn", "github": "https://github.com/gydpku/OCM", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/guo22g.html", "aff_unique_index": "0+0;1;0+0", "aff_unique_norm": "Peking 
University;University of Illinois at Chicago", "aff_unique_dep": "Wangxuan Institute of Computer Technology;Department of Computer Science", "aff_unique_url": "http://www.pku.edu.cn;https://www.uic.edu", "aff_unique_abbr": "PKU;UIC", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0+0;1;0+0", "aff_country_unique": "China;United States" }, { "title": "Online Decision Transformer", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16635", "id": "16635", "proceeding": "https://proceedings.mlr.press/v162/zheng22c.html", "poster": "", "slides": "", "author_site": "Qinqing Zheng, Amy Zhang, Aditya Grover", "author": "Qinqing Zheng; Amy Zhang; Aditya Grover", "abstract": "Recent work has shown that offline reinforcement learning (RL) can be formulated as a sequence modeling problem (Chen et al., 2021; Janner et al., 2021) and solved via approaches similar to large-scale language modeling. However, any practical instantiation of RL also involves an online component, where policies pretrained on passive offline datasets are finetuned via task-specific interactions with the environment. We propose Online Decision Transformers (ODT), an RL algorithm based on sequence modeling that blends offline pretraining with online finetuning in a unified framework. Our framework uses sequence-level entropy regularizers in conjunction with autoregressive modeling objectives for sample-efficient exploration and finetuning. Empirically, we show that ODT is competitive with the state-of-the-art in absolute performance on the D4RL benchmark but shows much more significant gains during the finetuning procedure.", "bibtex": "@InProceedings{pmlr-v162-zheng22c,\n title = \t {Online Decision Transformer},\n author = {Zheng, Qinqing and Zhang, Amy and Grover, Aditya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27042--27059},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zheng22c/zheng22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/zheng22c.html},\n abstract = \t {Recent work has shown that offline reinforcement learning (RL) can be formulated as a sequence modeling problem (Chen et al., 2021; Janner et al., 2021) and solved via approaches similar to large-scale language modeling. However, any practical instantiation of RL also involves an online component, where policies pretrained on passive offline datasets are finetuned via task-specific interactions with the environment. We propose Online Decision Transformers (ODT), an RL algorithm based on sequence modeling that blends offline pretraining with online finetuning in a unified framework. Our framework uses sequence-level entropy regularizers in conjunction with autoregressive modeling objectives for sample-efficient exploration and finetuning. 
Empirically, we show that ODT is competitive with the state-of-the-art in absolute performance on the D4RL benchmark but shows much more significant gains during the finetuning procedure.}\n}", "pdf": "https://proceedings.mlr.press/v162/zheng22c/zheng22c.pdf", "supp": "", "pdf_size": 1079164, "gs_citation": 276, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11549184825048973545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Meta AI Research; University of California, Berkeley + University of California, Los Angeles; University of California, Los Angeles", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zheng22c.html", "aff_unique_index": "0;1+2;2", "aff_unique_norm": "Meta;University of California, Berkeley;University of California, Los Angeles", "aff_unique_dep": "Meta AI Research;;", "aff_unique_url": "https://meta.com;https://www.berkeley.edu;https://www.ucla.edu", "aff_unique_abbr": "Meta AI;UC Berkeley;UCLA", "aff_campus_unique_index": "1+2;2", "aff_campus_unique": ";Berkeley;Los Angeles", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "United States" }, { "title": "Online Learning and Pricing with Reusable Resources: Linear Bandits with Sub-Exponential Rewards", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15963", "id": "15963", "proceeding": "https://proceedings.mlr.press/v162/jia22c.html", "poster": "/media/PosterPDFs/ICML%202022/1a5b1e4daae265b790965a275b53ae50_FVasXuY.png?t=1657163446.6169493", "slides": "", "author_site": "Huiwen Jia, Cong Shi, Siqian Shen", "author": "Huiwen Jia; Cong Shi; Siqian Shen", "abstract": "We consider a price-based revenue management problem with reusable resources over a finite time horizon $T$. The problem finds important applications in car/bicycle rental, ridesharing, cloud computing, and hospitality management. Customers arrive following a price-dependent Poisson process and each customer requests one unit of $c$ homogeneous reusable resources. If there is an available unit, the customer gets served within a price-dependent exponentially distributed service time; otherwise, she waits in a queue until the next available unit. The decision maker assumes that the inter-arrival and service intervals have an unknown linear dependence on a $d_f$-dimensional feature vector associated with the posted price. We propose a rate-optimal online learning and pricing algorithm, termed Batch Linear Confidence Bound (BLinUCB), and prove that the cumulative regret is $\\tilde{O}( d_f \\sqrt{T } )$. 
In establishing the regret, we bound the transient system performance upon price changes via a coupling argument, and also generalize linear bandits to accommodate sub-exponential rewards.", "bibtex": "@InProceedings{pmlr-v162-jia22c,\n title = \t {Online Learning and Pricing with Reusable Resources: Linear Bandits with Sub-Exponential Rewards},\n author = {Jia, Huiwen and Shi, Cong and Shen, Siqian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10135--10160},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jia22c/jia22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/jia22c.html},\n abstract = \t {We consider a price-based revenue management problem with reusable resources over a finite time horizon $T$. The problem finds important applications in car/bicycle rental, ridesharing, cloud computing, and hospitality management. Customers arrive following a price-dependent Poisson process and each customer requests one unit of $c$ homogeneous reusable resources. If there is an available unit, the customer gets served within a price-dependent exponentially distributed service time; otherwise, she waits in a queue until the next available unit. The decision maker assumes that the inter-arrival and service intervals have an unknown linear dependence on a $d_f$-dimensional feature vector associated with the posted price. We propose a rate-optimal online learning and pricing algorithm, termed Batch Linear Confidence Bound (BLinUCB), and prove that the cumulative regret is $\\tilde{O}( d_f \\sqrt{T } )$. 
In establishing the regret, we bound the transient system performance upon price changes via a coupling argument, and also generalize linear bandits to accommodate sub-exponential rewards.}\n}", "pdf": "https://proceedings.mlr.press/v162/jia22c/jia22c.pdf", "supp": "", "pdf_size": 755840, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8334157155664625639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Industrial and Operations Engineering, University of Michigan, Ann Arbor, MI 48109; Department of Industrial and Operations Engineering, University of Michigan, Ann Arbor, MI 48109; Department of Industrial and Operations Engineering, University of Michigan, Ann Arbor, MI 48109", "aff_domain": "umich.edu;umich.edu;umich.edu", "email": "umich.edu;umich.edu;umich.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jia22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "Department of Industrial and Operations Engineering", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Online Learning for Min Sum Set Cover and Pandora\u2019s Box", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17417", "id": "17417", "proceeding": "https://proceedings.mlr.press/v162/gergatsouli22a.html", "poster": "/media/PosterPDFs/ICML%202022/3c69ccff8acc065aa5618b36b74703de.png?t=1657522090.8349292", "slides": "", "author_site": "Evangelia Gergatsouli, Christos Tzamos", "author": "Evangelia Gergatsouli; Christos Tzamos", "abstract": "Two central problems in Stochastic Optimization are Min-Sum Set Cover and Pandora\u2019s Box. In Pandora\u2019s Box, we are presented with n boxes, each containing an unknown value and the goal is to open the boxes in some order to minimize the sum of the search cost and the smallest value found. Given a distribution of value vectors, we are asked to identify a near-optimal search order. Min-Sum Set Cover corresponds to the case where values are either 0 or infinity. In this work, we study the case where the value vectors are not drawn from a distribution but are presented to a learner in an online fashion. We present a computationally efficient algorithm that is constant-competitive against the cost of the optimal search order. We extend our results to a bandit setting where only the values of the boxes opened are revealed to the learner after every round. 
We also generalize our results to other commonly studied variants of Pandora\u2019s Box and Min-Sum Set Cover that involve selecting more than a single value subject to a matroid constraint.", "bibtex": "@InProceedings{pmlr-v162-gergatsouli22a,\n title = \t {Online Learning for Min Sum Set Cover and Pandora\u2019s Box},\n author = {Gergatsouli, Evangelia and Tzamos, Christos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7382--7403},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gergatsouli22a/gergatsouli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gergatsouli22a.html},\n abstract = \t {Two central problems in Stochastic Optimization are Min-Sum Set Cover and Pandora\u2019s Box. In Pandora\u2019s Box, we are presented with n boxes, each containing an unknown value and the goal is to open the boxes in some order to minimize the sum of the search cost and the smallest value found. Given a distribution of value vectors, we are asked to identify a near-optimal search order. Min-Sum Set Cover corresponds to the case where values are either 0 or infinity. In this work, we study the case where the value vectors are not drawn from a distribution but are presented to a learner in an online fashion. We present a computationally efficient algorithm that is constant-competitive against the cost of the optimal search order. We extend our results to a bandit setting where only the values of the boxes opened are revealed to the learner after every round. 
We also generalize our results to other commonly studied variants of Pandora\u2019s Box and Min-Sum Set Cover that involve selecting more than a single value subject to a matroid constraint.}\n}", "pdf": "https://proceedings.mlr.press/v162/gergatsouli22a/gergatsouli22a.pdf", "supp": "", "pdf_size": 426612, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2040023270500753317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Sciences, University of Wisconsin-Madison, Madison, WI, USA; Department of Computer Sciences, University of Wisconsin-Madison, Madison, WI, USA", "aff_domain": "cs.wisc.edu;wisc.edu", "email": "cs.wisc.edu;wisc.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/gergatsouli22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Online Learning with Knapsacks: the Best of Both Worlds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17349", "id": "17349", "proceeding": "https://proceedings.mlr.press/v162/castiglioni22a.html", "poster": "/media/PosterPDFs/ICML%202022/83cdcec08fbf90370fcf53bdd56604ff.png?t=1657632298.1263587", "slides": "", "author_site": "Matteo Castiglioni, Andrea Celli, Christian Kroer", "author": "Matteo Castiglioni; Andrea Celli; Christian Kroer", "abstract": "We study online learning problems in which a decision maker wants to maximize their expected reward without violating a finite set of $m$ resource constraints. By casting the learning process over a suitably defined space of strategy mixtures, we recover strong duality on a Lagrangian relaxation of the underlying optimization problem, even for general settings with non-convex reward and resource-consumption functions. Then, we provide the first best-of-both-worlds type framework for this setting, with no-regret guarantees both under stochastic and adversarial inputs. Our framework yields the same regret guarantees of prior work in the stochastic case. On the other hand, when budgets grow at least linearly in the time horizon, it allows us to provide a constant competitive ratio in the adversarial case, which improves over the $O(m \\log T)$ competitive ratio of Immorlica et al. [FOCS\u201919]. Moreover, our framework allows the decision maker to handle non-convex reward and cost functions. 
We provide two game-theoretic applications of our framework to give further evidence of its flexibility.", "bibtex": "@InProceedings{pmlr-v162-castiglioni22a,\n title = \t {Online Learning with Knapsacks: the Best of Both Worlds},\n author = {Castiglioni, Matteo and Celli, Andrea and Kroer, Christian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2767--2783},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/castiglioni22a/castiglioni22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/castiglioni22a.html},\n abstract = \t {We study online learning problems in which a decision maker wants to maximize their expected reward without violating a finite set of $m$ resource constraints. By casting the learning process over a suitably defined space of strategy mixtures, we recover strong duality on a Lagrangian relaxation of the underlying optimization problem, even for general settings with non-convex reward and resource-consumption functions. Then, we provide the first best-of-both-worlds type framework for this setting, with no-regret guarantees both under stochastic and adversarial inputs. Our framework yields the same regret guarantees of prior work in the stochastic case. On the other hand, when budgets grow at least linearly in the time horizon, it allows us to provide a constant competitive ratio in the adversarial case, which improves over the $O(m \\log T)$ competitive ratio of Immorlica et al. [FOCS\u201919]. Moreover, our framework allows the decision maker to handle non-convex reward and cost functions. 
We provide two game-theoretic applications of our framework to give further evidence of its flexibility.}\n}", "pdf": "https://proceedings.mlr.press/v162/castiglioni22a/castiglioni22a.pdf", "supp": "", "pdf_size": 375309, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7589088186742760999&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "DEIB, Politecnico di Milano, Milan, Italy; Department of Computing Sciences, Bocconi University, Milan, Italy; IEOR Department, Columbia University, New York, NY", "aff_domain": "polimi.it;unibocconi.it;columbia.edu", "email": "polimi.it;unibocconi.it;columbia.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/castiglioni22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Politecnico di Milano;Bocconi University;Columbia University", "aff_unique_dep": "DEIB;Department of Computing Sciences;IEOR Department", "aff_unique_url": "https://www.polimi.it;https://www.bocconi.edu;https://www.columbia.edu", "aff_unique_abbr": "Politecnico di Milano;Bocconi;Columbia", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Milan;New York", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Italy;United States" }, { "title": "Online Nonsubmodular Minimization with Delayed Costs: From Full Information to Bandit Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17751", "id": "17751", "proceeding": "https://proceedings.mlr.press/v162/lin22g.html", "poster": "/media/PosterPDFs/ICML%202022/204da255aea2cd4a75ace6018fad6b4d_Trm2koO.png?t=1657519312.3562088", "slides": "", "author_site": "Tianyi Lin, Aldo Pacchiano, Yaodong Yu, Michael Jordan", "author": "Tianyi Lin; Aldo Pacchiano; Yaodong Yu; Michael Jordan", "abstract": "Motivated by applications to online learning in sparse estimation and Bayesian optimization, we consider the problem of online unconstrained nonsubmodular minimization with delayed costs in both full information and bandit feedback settings. In contrast to previous works on online unconstrained submodular minimization, we focus on a class of nonsubmodular functions with special structure, and prove regret guarantees for several variants of the online and approximate online bandit gradient descent algorithms in static and delayed scenarios. We derive bounds for the agent\u2019s regret in the full information and bandit feedback setting, even if the delay between choosing a decision and receiving the incurred cost is unbounded. Key to our approach is the notion of $(\\alpha, \\beta)$-regret and the extension of the generic convex relaxation model from\u00a0\\citet{El-2020-Optimal}, the analysis of which is of independent interest. 
We conduct and showcase several simulation studies to demonstrate the efficacy of our algorithms.", "bibtex": "@InProceedings{pmlr-v162-lin22g,\n title = \t {Online Nonsubmodular Minimization with Delayed Costs: From Full Information to Bandit Feedback},\n author = {Lin, Tianyi and Pacchiano, Aldo and Yu, Yaodong and Jordan, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13441--13467},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22g/lin22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22g.html},\n abstract = \t {Motivated by applications to online learning in sparse estimation and Bayesian optimization, we consider the problem of online unconstrained nonsubmodular minimization with delayed costs in both full information and bandit feedback settings. In contrast to previous works on online unconstrained submodular minimization, we focus on a class of nonsubmodular functions with special structure, and prove regret guarantees for several variants of the online and approximate online bandit gradient descent algorithms in static and delayed scenarios. We derive bounds for the agent\u2019s regret in the full information and bandit feedback setting, even if the delay between choosing a decision and receiving the incurred cost is unbounded. Key to our approach is the notion of $(\\alpha, \\beta)$-regret and the extension of the generic convex relaxation model from\u00a0\\citet{El-2020-Optimal}, the analysis of which is of independent interest. 
We conduct and showcase several simulation studies to demonstrate the efficacy of our algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22g/lin22g.pdf", "supp": "", "pdf_size": 617302, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13213174914212174761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering and Computer Science, UC Berkeley; Microsoft Research, NYC; Department of Electrical Engineering and Computer Science, UC Berkeley; Department of Statistics, UC Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lin22g.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of California, Berkeley;Microsoft", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;Research", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UC Berkeley;MSR", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Berkeley;New York City", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online and Consistent Correlation Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17637", "id": "17637", "proceeding": "https://proceedings.mlr.press/v162/cohen-addad22a.html", "poster": "", "slides": "", "author_site": "Vincent Cohen-Addad, Silvio Lattanzi, Andreas Maggiori, Nikos Parotsidis", "author": "Vincent Cohen-Addad; Silvio Lattanzi; Andreas Maggiori; Nikos Parotsidis", "abstract": "In the correlation clustering problem the input is a signed graph where the sign indicates whether each pair of points should be placed in the same cluster or not. The goal of the problem is to compute a clustering which minimizes the number of disagreements with such recommendation. Thanks to its many practical applications, correlation clustering is a fundamental unsupervised learning problem and has been extensively studied in many different settings. In this paper we study the problem in the classic online setting with recourse; The vertices of the graphs arrive in an online manner and the goal is to maintain an approximate clustering while minimizing the number of times each vertex changes cluster. Our main contribution is an algorithm that achieves logarithmic recourse per vertex in the worst case. We also complement this result with a tight lower bound. 
Finally we show experimentally that our algorithm achieves better performances than state-of-the-art algorithms on real world data.", "bibtex": "@InProceedings{pmlr-v162-cohen-addad22a,\n title = \t {Online and Consistent Correlation Clustering},\n author = {Cohen-Addad, Vincent and Lattanzi, Silvio and Maggiori, Andreas and Parotsidis, Nikos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4157--4179},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cohen-addad22a/cohen-addad22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cohen-addad22a.html},\n abstract = \t {In the correlation clustering problem the input is a signed graph where the sign indicates whether each pair of points should be placed in the same cluster or not. The goal of the problem is to compute a clustering which minimizes the number of disagreements with such recommendation. Thanks to its many practical applications, correlation clustering is a fundamental unsupervised learning problem and has been extensively studied in many different settings. In this paper we study the problem in the classic online setting with recourse; The vertices of the graphs arrive in an online manner and the goal is to maintain an approximate clustering while minimizing the number of times each vertex changes cluster. Our main contribution is an algorithm that achieves logarithmic recourse per vertex in the worst case. We also complement this result with a tight lower bound. Finally we show experimentally that our algorithm achieves better performances than state-of-the-art algorithms on real world data.}\n}", "pdf": "https://proceedings.mlr.press/v162/cohen-addad22a/cohen-addad22a.pdf", "supp": "", "pdf_size": 734511, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4195174394843237429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google; Google; EPFL, Lausanne, Switzerland; Google", "aff_domain": "google.com;google.com;epfl.ch;google.com", "email": "google.com;google.com;epfl.ch;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/cohen-addad22a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Google;EPFL", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.epfl.ch", "aff_unique_abbr": "Google;EPFL", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Mountain View;Lausanne", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Only tails matter: Average-Case Universality and Robustness in the Convex Regime", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17505", "id": "17505", "proceeding": "https://proceedings.mlr.press/v162/cunha22a.html", "poster": "", "slides": "", "author_site": "LEONARDO CUNHA, Gauthier Gidel, Fabian Pedregosa, Damien Scieur, Courtney Paquette", "author": "Leonardo Cunha; Gauthier Gidel; Fabian Pedregosa; Damien Scieur; Courtney Paquette", "abstract": "The recently developed average-case analysis of optimization methods allows a more fine-grained and representative convergence analysis than usual worst-case results. 
In exchange, this analysis requires a more precise hypothesis over the data generating process, namely assuming knowledge of the expected spectral distribution (ESD) of the random matrix associated with the problem. This work shows that the concentration of eigenvalues near the edges of the ESD determines a problem\u2019s asymptotic average complexity. This a priori information on this concentration is a more grounded assumption than complete knowledge of the ESD. This approximate concentration is effectively a middle ground between the coarseness of the worst-case scenario convergence and the restrictive previous average-case analysis. We also introduce the Generalized Chebyshev method, asymptotically optimal under a hypothesis on this concentration and globally optimal when the ESD follows a Beta distribution. We compare its performance to classical optimization algorithms, such as gradient descent or Nesterov\u2019s scheme, and we show that, in the average-case context, Nesterov\u2019s method is universally nearly optimal asymptotically.", "bibtex": "@InProceedings{pmlr-v162-cunha22a,\n title = \t {Only tails matter: Average-Case Universality and Robustness in the Convex Regime},\n author = {Cunha, Leonardo and Gidel, Gauthier and Pedregosa, Fabian and Scieur, Damien and Paquette, Courtney},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4474--4491},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cunha22a/cunha22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cunha22a.html},\n abstract = \t {The recently developed average-case analysis of optimization methods allows a more fine-grained and representative convergence analysis than usual worst-case results. In exchange, this analysis requires a more precise hypothesis over the data generating process, namely assuming knowledge of the expected spectral distribution (ESD) of the random matrix associated with the problem. This work shows that the concentration of eigenvalues near the edges of the ESD determines a problem\u2019s asymptotic average complexity. This a priori information on this concentration is a more grounded assumption than complete knowledge of the ESD. This approximate concentration is effectively a middle ground between the coarseness of the worst-case scenario convergence and the restrictive previous average-case analysis. We also introduce the Generalized Chebyshev method, asymptotically optimal under a hypothesis on this concentration and globally optimal when the ESD follows a Beta distribution. 
We compare its performance to classical optimization algorithms, such as gradient descent or Nesterov\u2019s scheme, and we show that, in the average-case context, Nesterov\u2019s method is universally nearly optimal asymptotically.}\n}", "pdf": "https://proceedings.mlr.press/v162/cunha22a/cunha22a.pdf", "supp": "", "pdf_size": 1882516, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=936811545281060495&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "MILA and DIRO, Universit\u00e9 de Montreal, Montreal, Canada; MILA and DIRO, Universit\u00e9 de Montreal, Montreal, Canada+Canada CIFAR AI Chair; Google Research; Samsung SAIT AI Lab, Montreal, Canada; McGill University, Montreal, Canada", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/cunha22a.html", "aff_unique_index": "0;0+1;2;3;4", "aff_unique_norm": "Universit\u00e9 de Montreal;Canadian Institute for Advanced Research;Google;Samsung;McGill University", "aff_unique_dep": "MILA and DIRO;AI Chair;Google Research;AI Lab;", "aff_unique_url": "https://www.umontreal.ca;https://www.cifar.ca;https://research.google;https://www.sait.samsung.com;https://www.mcgill.ca", "aff_unique_abbr": "UM;CIFAR;Google Research;SAIT;McGill", "aff_campus_unique_index": "0;0;2;0;0", "aff_campus_unique": "Montreal;;Mountain View", "aff_country_unique_index": "0;0+0;1;0;0", "aff_country_unique": "Canada;United States" }, { "title": "Open-Sampling: Exploring Out-of-Distribution data for Re-balancing Long-tailed datasets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17129", "id": "17129", "proceeding": "https://proceedings.mlr.press/v162/wei22c.html", "poster": "/media/PosterPDFs/ICML%202022/7fd4db88d31ab524e0afe153c4f9465a.png?t=1657714685.234391", "slides": "", "author_site": "Hongxin Wei, Lue Tao, RENCHUNZI XIE, LEI FENG, Bo An", "author": "Hongxin Wei; Lue Tao; Renchunzi Xie; Lei Feng; Bo An", "abstract": "Deep neural networks usually perform poorly when the training dataset suffers from extreme class imbalance. Recent studies found that directly training with out-of-distribution data (i.e., open-set samples) in a semi-supervised manner would harm the generalization performance. In this work, we theoretically show that out-of-distribution data can still be leveraged to augment the minority classes from a Bayesian perspective. Based on this motivation, we propose a novel method called Open-sampling, which utilizes open-set noisy labels to re-balance the class priors of the training dataset. For each open-set instance, the label is sampled from our pre-defined distribution that is complementary to the distribution of original class priors. We empirically show that Open-sampling not only re-balances the class priors but also encourages the neural network to learn separable representations. 
Extensive experiments demonstrate that our proposed method significantly outperforms existing data re-balancing methods and can boost the performance of existing state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-wei22c,\n title = \t {Open-Sampling: Exploring Out-of-Distribution data for Re-balancing Long-tailed datasets},\n author = {Wei, Hongxin and Tao, Lue and Xie, Renchunzi and Feng, Lei and An, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23615--23630},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wei22c/wei22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/wei22c.html},\n abstract = \t {Deep neural networks usually perform poorly when the training dataset suffers from extreme class imbalance. Recent studies found that directly training with out-of-distribution data (i.e., open-set samples) in a semi-supervised manner would harm the generalization performance. In this work, we theoretically show that out-of-distribution data can still be leveraged to augment the minority classes from a Bayesian perspective. Based on this motivation, we propose a novel method called Open-sampling, which utilizes open-set noisy labels to re-balance the class priors of the training dataset. For each open-set instance, the label is sampled from our pre-defined distribution that is complementary to the distribution of original class priors. We empirically show that Open-sampling not only re-balances the class priors but also encourages the neural network to learn separable representations. 
Extensive experiments demonstrate that our proposed method significantly outperforms existing data re-balancing methods and can boost the performance of existing state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wei22c/wei22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wei22c-supp.zip", "pdf_size": 1617047, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7341721827370829232&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Nanyang Technological University, Singapore; Nanjing University of Aeronautics and Astronautics, Nanjing, Jiangsu, China; Nanyang Technological University, Singapore + Chongqing University, Chongqing, China; Chongqing University, Chongqing, China; Nanyang Technological University, Singapore", "aff_domain": "ntu.edu.sg;nuaa.edu.cn;e.ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "email": "ntu.edu.sg;nuaa.edu.cn;e.ntu.edu.sg;ntu.edu.sg;ntu.edu.sg", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wei22c.html", "aff_unique_index": "0;1;0+2;2;0", "aff_unique_norm": "Nanyang Technological University;Nanjing University of Aeronautics and Astronautics;Chongqing University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.sg;http://www.nuaa.edu.cn;http://www.cqu.edu.cn/", "aff_unique_abbr": "NTU;NUAA;CQU", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Nanjing;Chongqing", "aff_country_unique_index": "0;1;0+1;1;0", "aff_country_unique": "Singapore;China" }, { "title": "Optimal Algorithms for Mean Estimation under Local Differential Privacy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16545", "id": "16545", "proceeding": "https://proceedings.mlr.press/v162/asi22b.html", "poster": "/media/PosterPDFs/ICML%202022/1fd09c5f59a8ff35d499c0ee25a1d47e.png?t=1658168498.3217654", "slides": "", "author_site": "Hilal Asi, Vitaly Feldman, Kunal Talwar", "author": "Hilal Asi; Vitaly Feldman; Kunal Talwar", "abstract": "We study the problem of mean estimation of $\\ell_2$-bounded vectors under the constraint of local differential privacy. While the literature has a variety of algorithms that achieve the (asymptotic) optimal rates for this problem, the performance of these algorithms in practice can vary significantly due to varying (and often large) hidden constants. In this work, we investigate the question of designing the randomizer with the smallest variance. We show that PrivUnit (Bhowmick et al. 2018) with optimized parameters achieves the optimal variance among a large family of natural randomizers. To prove this result, we establish some properties of local randomizers, and use symmetrization arguments that allow us to write the optimal randomizer as the optimizer of a certain linear program. These structural results, which should extend to other problems, then allow us to show that the optimal randomizer belongs to the PrivUnit family. We also develop a new variant of PrivUnit based on the Gaussian distribution which is more amenable to mathematical analysis and enjoys the same optimality guarantees. 
This allows us to establish several useful properties on the exact constants of the optimal error as well as to numerically estimate these constants.", "bibtex": "@InProceedings{pmlr-v162-asi22b,\n title = \t {Optimal Algorithms for Mean Estimation under Local Differential Privacy},\n author = {Asi, Hilal and Feldman, Vitaly and Talwar, Kunal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1046--1056},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/asi22b/asi22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/asi22b.html},\n abstract = \t {We study the problem of mean estimation of $\\ell_2$-bounded vectors under the constraint of local differential privacy. While the literature has a variety of algorithms that achieve the (asymptotic) optimal rates for this problem, the performance of these algorithms in practice can vary significantly due to varying (and often large) hidden constants. In this work, we investigate the question of designing the randomizer with the smallest variance. We show that PrivUnit (Bhowmick et al. 2018) with optimized parameters achieves the optimal variance among a large family of natural randomizers. To prove this result, we establish some properties of local randomizers, and use symmetrization arguments that allow us to write the optimal randomizer as the optimizer of a certain linear program. These structural results, which should extend to other problems, then allow us to show that the optimal randomizer belongs to the PrivUnit family. We also develop a new variant of PrivUnit based on the Gaussian distribution which is more amenable to mathematical analysis and enjoys the same optimality guarantees. 
This allows us to establish several useful properties on the exact constants of the optimal error as well as to numerically estimate these constants.}\n}", "pdf": "https://proceedings.mlr.press/v162/asi22b/asi22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/asi22b-supp.zip", "pdf_size": 381231, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1735044226681395764&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Stanford University + Apple; Apple; Apple", "aff_domain": "stanford.edu;gmail.com;apple.com", "email": "stanford.edu;gmail.com;apple.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/asi22b.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Stanford University;Apple", "aff_unique_dep": ";Apple Inc.", "aff_unique_url": "https://www.stanford.edu;https://www.apple.com", "aff_unique_abbr": "Stanford;Apple", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Optimal Algorithms for Stochastic Multi-Level Compositional Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16851", "id": "16851", "proceeding": "https://proceedings.mlr.press/v162/jiang22c.html", "poster": "/media/PosterPDFs/ICML%202022/47267ca39f652c0de27a4b27c5e11c40.png?t=1656775675.8691657", "slides": "", "author_site": "Wei Jiang, Bokun Wang, Yibo Wang, Lijun Zhang, Tianbao Yang", "author": "Wei Jiang; Bokun Wang; Yibo Wang; Lijun Zhang; Tianbao Yang", "abstract": "In this paper, we investigate the problem of stochastic multi-level compositional optimization, where the objective function is a composition of multiple smooth but possibly non-convex functions. Existing methods for solving this problem either suffer from sub-optimal sample complexities or need a huge batch size. To address this limitation, we propose a Stochastic Multi-level Variance Reduction method (SMVR), which achieves the optimal sample complexity of $\\mathcal{O}\\left(1 / \\epsilon^{3}\\right)$ to find an $\\epsilon$-stationary point for non-convex objectives. Furthermore, when the objective function satisfies the convexity or Polyak-{\u0141}ojasiewicz (PL) condition, we propose a stage-wise variant of SMVR and improve the sample complexity to $\\mathcal{O}\\left(1 / \\epsilon^{2}\\right)$ for convex functions or $\\mathcal{O}\\left(1 /(\\mu\\epsilon)\\right)$ for non-convex functions satisfying the $\\mu$-PL condition. The latter result implies the same complexity for $\\mu$-strongly convex functions. To make use of adaptive learning rates, we also develop Adaptive SMVR, which achieves the same optimal complexities but converges faster in practice. 
All our complexities match the lower bounds not only in terms of $\\epsilon$ but also in terms of $\\mu$ (for PL or strongly convex functions), without using a large batch size in each iteration.", "bibtex": "@InProceedings{pmlr-v162-jiang22c,\n title = \t {Optimal Algorithms for Stochastic Multi-Level Compositional Optimization},\n author = {Jiang, Wei and Wang, Bokun and Wang, Yibo and Zhang, Lijun and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10195--10216},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jiang22c/jiang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/jiang22c.html},\n abstract = \t {In this paper, we investigate the problem of stochastic multi-level compositional optimization, where the objective function is a composition of multiple smooth but possibly non-convex functions. Existing methods for solving this problem either suffer from sub-optimal sample complexities or need a huge batch size. To address this limitation, we propose a Stochastic Multi-level Variance Reduction method (SMVR), which achieves the optimal sample complexity of $\\mathcal{O}\\left(1 / \\epsilon^{3}\\right)$ to find an $\\epsilon$-stationary point for non-convex objectives. Furthermore, when the objective function satisfies the convexity or Polyak-{\u0141}ojasiewicz (PL) condition, we propose a stage-wise variant of SMVR and improve the sample complexity to $\\mathcal{O}\\left(1 / \\epsilon^{2}\\right)$ for convex functions or $\\mathcal{O}\\left(1 /(\\mu\\epsilon)\\right)$ for non-convex functions satisfying the $\\mu$-PL condition. The latter result implies the same complexity for $\\mu$-strongly convex functions. To make use of adaptive learning rates, we also develop Adaptive SMVR, which achieves the same optimal complexities but converges faster in practice. 
All our complexities match the lower bounds not only in terms of $\\epsilon$ but also in terms of $\\mu$ (for PL or strongly convex functions), without using a large batch size in each iteration.}\n}", "pdf": "https://proceedings.mlr.press/v162/jiang22c/jiang22c.pdf", "supp": "", "pdf_size": 766705, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3581531998097273584&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; Department of Computer Science, The University of Iowa, Iowa City, USA; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; Department of Computer Science, The University of Iowa, Iowa City, USA", "aff_domain": "lamda.nju.edu.cn;uiowa.edu; ; ;uiowa.edu", "email": "lamda.nju.edu.cn;uiowa.edu; ; ;uiowa.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/jiang22c.html", "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Nanjing University;University of Iowa", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;Department of Computer Science", "aff_unique_url": "http://www.nju.edu.cn;https://www.uiowa.edu", "aff_unique_abbr": "Nanjing U;UIowa", "aff_campus_unique_index": "0;1;0;0;1", "aff_campus_unique": "Nanjing;Iowa City", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Optimal Clipping and Magnitude-aware Differentiation for Improved Quantization-aware Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16437", "id": "16437", "proceeding": "https://proceedings.mlr.press/v162/sakr22a.html", "poster": "/media/PosterPDFs/ICML%202022/cc06a6150b92e17dd3076a0f0f9d2af4.png?t=1657832984.564331", "slides": "", "author_site": "Charbel Sakr, Steve Dai, Rangha Venkatesan, Brian Zimmer, William Dally, Brucek Khailany", "author": "Charbel Sakr; Steve Dai; Rangha Venkatesan; Brian Zimmer; William Dally; Brucek Khailany", "abstract": "Data clipping is crucial in reducing noise in quantization operations and improving the achievable accuracy of quantization-aware training (QAT). Current practices rely on heuristics to set clipping threshold scalars and cannot be shown to be optimal. We propose Optimally Clipped Tensors And Vectors (OCTAV), a recursive algorithm to determine MSE-optimal clipping scalars. Derived from the fast Newton-Raphson method, OCTAV finds optimal clipping scalars on the fly, for every tensor, at every iteration of the QAT routine. Thus, the QAT algorithm is formulated with provably minimum quantization noise at each step. In addition, we reveal limitations in common gradient estimation techniques in QAT and propose magnitude-aware differentiation as a remedy to further improve accuracy. Experimentally, OCTAV-enabled QAT achieves state-of-the-art accuracy on multiple tasks. These include training-from-scratch and retraining ResNets and MobileNets on ImageNet, and Squad fine-tuning using BERT models, where OCTAV-enabled QAT consistently preserves accuracy at low precision (4-to-6-bits). 
Our results require no modifications to the baseline training recipe, except for the insertion of quantization operations where appropriate.", "bibtex": "@InProceedings{pmlr-v162-sakr22a,\n title = \t {Optimal Clipping and Magnitude-aware Differentiation for Improved Quantization-aware Training},\n author = {Sakr, Charbel and Dai, Steve and Venkatesan, Rangha and Zimmer, Brian and Dally, William and Khailany, Brucek},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19123--19138},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sakr22a/sakr22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sakr22a.html},\n abstract = \t {Data clipping is crucial in reducing noise in quantization operations and improving the achievable accuracy of quantization-aware training (QAT). Current practices rely on heuristics to set clipping threshold scalars and cannot be shown to be optimal. We propose Optimally Clipped Tensors And Vectors (OCTAV), a recursive algorithm to determine MSE-optimal clipping scalars. Derived from the fast Newton-Raphson method, OCTAV finds optimal clipping scalars on the fly, for every tensor, at every iteration of the QAT routine. Thus, the QAT algorithm is formulated with provably minimum quantization noise at each step. In addition, we reveal limitations in common gradient estimation techniques in QAT and propose magnitude-aware differentiation as a remedy to further improve accuracy. Experimentally, OCTAV-enabled QAT achieves state-of-the-art accuracy on multiple tasks. These include training-from-scratch and retraining ResNets and MobileNets on ImageNet, and Squad fine-tuning using BERT models, where OCTAV-enabled QAT consistently preserves accuracy at low precision (4-to-6-bits). 
Our results require no modifications to the baseline training recipe, except for the insertion of quantization operations where appropriate.}\n}", "pdf": "https://proceedings.mlr.press/v162/sakr22a/sakr22a.pdf", "supp": "", "pdf_size": 849760, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5094761893107363193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "NVIDIA Corporation, Santa Clara, CA 95051 USA; NVIDIA Corporation, Santa Clara, CA 95051 USA; NVIDIA Corporation, Santa Clara, CA 95051 USA; NVIDIA Corporation, Santa Clara, CA 95051 USA; NVIDIA Corporation, Santa Clara, CA 95051 USA; NVIDIA Corporation, Santa Clara, CA 95051 USA", "aff_domain": "nvidia.com; ; ; ; ; ", "email": "nvidia.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/sakr22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Corporation", "aff_unique_url": "https://www.nvidia.com", "aff_unique_abbr": "NVIDIA", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Santa Clara", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Optimal Clustering with Noisy Queries via Multi-Armed Bandit", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17767", "id": "17767", "proceeding": "https://proceedings.mlr.press/v162/xia22a.html", "poster": "/media/PosterPDFs/ICML%202022/921c2dc40d0b979c2910298d2f880152.png?t=1657584255.6376367", "slides": "", "author_site": "Jinghui Xia, Zengfeng Huang", "author": "Jinghui Xia; Zengfeng Huang", "abstract": "Motivated by many applications, we study clustering with a faulty oracle. In this problem, there are $n$ items belonging to $k$ unknown clusters, and the algorithm is allowed to ask the oracle whether two items belong to the same cluster or not. However, the answer from the oracle is correct only with probability $\\frac{1}{2}+\\frac{\\delta}{2}$. The goal is to recover the hidden clusters with minimum number of noisy queries. Previous works have shown that the problem can be solved with $O(\\frac{nk\\log n}{\\delta^2} + \\text{poly}(k,\\frac{1}{\\delta}, \\log n))$ queries, while $\\Omega(\\frac{nk}{\\delta^2})$ queries is known to be necessary. So, for any values of $k$ and $\\delta$, there is still a non-trivial gap between upper and lower bounds. In this work, we obtain the first matching upper and lower bounds for a wide range of parameters. In particular, a new polynomial time algorithm with $O(\\frac{n(k+\\log n)}{\\delta^2} + \\text{poly}(k,\\frac{1}{\\delta}, \\log n))$ queries is proposed. Moreover, we prove a new lower bound of $\\Omega(\\frac{n\\log n}{\\delta^2})$, which, combined with the existing $\\Omega(\\frac{nk}{\\delta^2})$ bound, matches our upper bound up to an additive $\\text{poly}(k,\\frac{1}{\\delta},\\log n)$ term. 
To obtain the new results, our main ingredient is an interesting connection between our problem and multi-armed bandit, which might provide useful insights for other similar problems.", "bibtex": "@InProceedings{pmlr-v162-xia22a,\n title = \t {Optimal Clustering with Noisy Queries via Multi-Armed Bandit},\n author = {Xia, Jinghui and Huang, Zengfeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24315--24331},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xia22a/xia22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xia22a.html},\n abstract = \t {Motivated by many applications, we study clustering with a faulty oracle. In this problem, there are $n$ items belonging to $k$ unknown clusters, and the algorithm is allowed to ask the oracle whether two items belong to the same cluster or not. However, the answer from the oracle is correct only with probability $\\frac{1}{2}+\\frac{\\delta}{2}$. The goal is to recover the hidden clusters with minimum number of noisy queries. Previous works have shown that the problem can be solved with $O(\\frac{nk\\log n}{\\delta^2} + \\text{poly}(k,\\frac{1}{\\delta}, \\log n))$ queries, while $\\Omega(\\frac{nk}{\\delta^2})$ queries is known to be necessary. So, for any values of $k$ and $\\delta$, there is still a non-trivial gap between upper and lower bounds. In this work, we obtain the first matching upper and lower bounds for a wide range of parameters. In particular, a new polynomial time algorithm with $O(\\frac{n(k+\\log n)}{\\delta^2} + \\text{poly}(k,\\frac{1}{\\delta}, \\log n))$ queries is proposed. Moreover, we prove a new lower bound of $\\Omega(\\frac{n\\log n}{\\delta^2})$, which, combined with the existing $\\Omega(\\frac{nk}{\\delta^2})$ bound, matches our upper bound up to an additive $\\text{poly}(k,\\frac{1}{\\delta},\\log n)$ term. 
To obtain the new results, our main ingredient is an interesting connection between our problem and multi-armed bandit, which might provide useful insights for other similar problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/xia22a/xia22a.pdf", "supp": "", "pdf_size": 358519, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=171020587293183367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Data Science, Fudan University, Shanghai, China; Shanghai Key Lab of Intelligent Information Processing", "aff_domain": "fudan.edu.cn;fudan.edu.cn", "email": "fudan.edu.cn;fudan.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/xia22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Fudan University;Shanghai Key Lab of Intelligent Information Processing", "aff_unique_dep": "School of Data Science;Intelligent Information Processing", "aff_unique_url": "https://www.fudan.edu.cn;", "aff_unique_abbr": "Fudan;", "aff_campus_unique_index": "0", "aff_campus_unique": "Shanghai;", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Optimal Estimation of Policy Gradient via Double Fitted Iteration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17155", "id": "17155", "proceeding": "https://proceedings.mlr.press/v162/ni22b.html", "poster": "", "slides": "", "author_site": "Chengzhuo Ni, Ruiqi Zhang, Xiang Ji, Xuezhou Zhang, Mengdi Wang", "author": "Chengzhuo Ni; Ruiqi Zhang; Xiang Ji; Xuezhou Zhang; Mengdi Wang", "abstract": "Policy gradient (PG) estimation becomes a challenge when we are not allowed to sample with the target policy but only have access to a dataset generated by some unknown behavior policy. Conventional methods for off-policy PG estimation often suffer from either significant bias or exponentially large variance. In this paper, we propose the double Fitted PG estimation (FPG) algorithm. FPG can work with an arbitrary policy parameterization, assuming access to a Bellman-complete value function class. In the case of linear value function approximation, we provide a tight finite-sample upper bound on policy gradient estimation error, that is governed by the amount of distribution mismatch measured in feature space. We also establish the asymptotic normality of FPG estimation error with a precise covariance characterization, which is further shown to be statistically optimal with a matching Cramer-Rao lower bound. Empirically, we evaluate the performance of FPG on both policy gradient estimation and policy optimization, using either softmax tabular or ReLU policy networks. 
Under various metrics, our results show that FPG significantly outperforms existing off-policy PG estimation methods based on importance sampling and variance reduction techniques.", "bibtex": "@InProceedings{pmlr-v162-ni22b,\n title = \t {Optimal Estimation of Policy Gradient via Double Fitted Iteration},\n author = {Ni, Chengzhuo and Zhang, Ruiqi and Ji, Xiang and Zhang, Xuezhou and Wang, Mengdi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16724--16783},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ni22b/ni22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/ni22b.html},\n abstract = \t {Policy gradient (PG) estimation becomes a challenge when we are not allowed to sample with the target policy but only have access to a dataset generated by some unknown behavior policy. Conventional methods for off-policy PG estimation often suffer from either significant bias or exponentially large variance. In this paper, we propose the double Fitted PG estimation (FPG) algorithm. FPG can work with an arbitrary policy parameterization, assuming access to a Bellman-complete value function class. In the case of linear value function approximation, we provide a tight finite-sample upper bound on policy gradient estimation error, that is governed by the amount of distribution mismatch measured in feature space. We also establish the asymptotic normality of FPG estimation error with a precise covariance characterization, which is further shown to be statistically optimal with a matching Cramer-Rao lower bound. Empirically, we evaluate the performance of FPG on both policy gradient estimation and policy optimization, using either softmax tabular or ReLU policy networks. 
Under various metrics, our results show that FPG significantly outperforms existing off-policy PG estimation methods based on importance sampling and variance reduction techniques.}\n}", "pdf": "https://proceedings.mlr.press/v162/ni22b/ni22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/ni22b-supp.zip", "pdf_size": 982621, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3812428366726963685&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, Princeton University, Princeton, NJ, USA+School of Mathematical Science, Peking University, Beijing, China; School of Mathematical Science, Peking University, Beijing, China; Department of Electrical and Computer Engineering, Princeton University, Princeton, NJ, USA; Department of Electrical and Computer Engineering, Princeton University, Princeton, NJ, USA; Department of Electrical and Computer Engineering, Princeton University, Princeton, NJ, USA", "aff_domain": "princeton.edu; ; ; ;princeton.edu", "email": "princeton.edu; ; ; ;princeton.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ni22b.html", "aff_unique_index": "0+1;1;0;0;0", "aff_unique_norm": "Princeton University;Peking University", "aff_unique_dep": "Department of Electrical and Computer Engineering;School of Mathematical Science", "aff_unique_url": "https://www.princeton.edu;http://www.pku.edu.cn", "aff_unique_abbr": "Princeton;PKU", "aff_campus_unique_index": "0+1;1;0;0;0", "aff_campus_unique": "Princeton;Beijing", "aff_country_unique_index": "0+1;1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Optimal and Efficient Dynamic Regret Algorithms for Non-Stationary Dueling Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17675", "id": "17675", "proceeding": "https://proceedings.mlr.press/v162/saha22b.html", "poster": "/media/PosterPDFs/ICML%202022/5314b9674c86e3f9d1ba25ef9bb32895_wvsbA9S.png?t=1657809990.7821784", "slides": "", "author_site": "Aadirupa Saha, Shubham Gupta", "author": "Aadirupa Saha; Shubham Gupta", "abstract": "We study the problem of", "bibtex": "@InProceedings{pmlr-v162-saha22b,\n title = \t {Optimal and Efficient Dynamic Regret Algorithms for Non-Stationary Dueling Bandits},\n author = {Saha, Aadirupa and Gupta, Shubham},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19027--19049},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saha22b/saha22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/saha22b.html},\n abstract = \t {We study the problem of", "pdf": "https://proceedings.mlr.press/v162/saha22b/saha22b.pdf", "supp": "", "pdf_size": 715861, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4830113447025588262&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Microsoft Research, New York City, United States; IBM Research, Orsay, France", "aff_domain": "microsoft.com;ibm.com", "email": "microsoft.com;ibm.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/saha22b.html", "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;IBM", 
"aff_unique_dep": "Microsoft Research;IBM Research", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.ibm.com/research", "aff_unique_abbr": "MSR;IBM", "aff_campus_unique_index": "0;1", "aff_campus_unique": "New York City;Orsay", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;France" }, { "title": "Optimally Controllable Perceptual Lossy Compression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16875", "id": "16875", "proceeding": "https://proceedings.mlr.press/v162/yan22b.html", "poster": "/media/PosterPDFs/ICML%202022/2e855f9489df0712b4bd8ea9e2848c5a.png?t=1657272117.7102563", "slides": "", "author_site": "Zeyu Yan, Fei Wen, Peilin Liu", "author": "Zeyu Yan; Fei Wen; Peilin Liu", "abstract": "Recent studies in lossy compression show that distortion and perceptual quality are at odds with each other, which put forward the tradeoff between distortion and perception (D-P). Intuitively, to attain different perceptual quality, different decoders have to be trained. In this paper, we present a nontrivial finding that only two decoders are sufficient for optimally achieving arbitrary (an infinite number of different) D-P tradeoff. We prove that arbitrary points of the D-P tradeoff bound can be achieved by a simple linear interpolation between the outputs of a minimum MSE decoder and a specifically constructed perfect perceptual decoder. Meanwhile, the perceptual quality (in terms of the squared Wasserstein-2 distance metric) can be quantitatively controlled by the interpolation factor. Furthermore, to construct a perfect perceptual decoder, we propose two theoretically optimal training frameworks. The new frameworks are different from the distortion-plus-adversarial loss based heuristic framework widely used in existing methods, which are not only theoretically optimal but also can yield state-of-the-art performance in practical perceptual decoding. Finally, we validate our theoretical finding and demonstrate the superiority of our frameworks via experiments. Code is available at: https://github.com/ZeyuYan/Controllable-Perceptual-Compression", "bibtex": "@InProceedings{pmlr-v162-yan22b,\n title = \t {Optimally Controllable Perceptual Lossy Compression},\n author = {Yan, Zeyu and Wen, Fei and Liu, Peilin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24911--24928},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yan22b/yan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/yan22b.html},\n abstract = \t {Recent studies in lossy compression show that distortion and perceptual quality are at odds with each other, which put forward the tradeoff between distortion and perception (D-P). Intuitively, to attain different perceptual quality, different decoders have to be trained. In this paper, we present a nontrivial finding that only two decoders are sufficient for optimally achieving arbitrary (an infinite number of different) D-P tradeoff. We prove that arbitrary points of the D-P tradeoff bound can be achieved by a simple linear interpolation between the outputs of a minimum MSE decoder and a specifically constructed perfect perceptual decoder. 
Meanwhile, the perceptual quality (in terms of the squared Wasserstein-2 distance metric) can be quantitatively controlled by the interpolation factor. Furthermore, to construct a perfect perceptual decoder, we propose two theoretically optimal training frameworks. The new frameworks are different from the distortion-plus-adversarial loss based heuristic framework widely used in existing methods, which are not only theoretically optimal but also can yield state-of-the-art performance in practical perceptual decoding. Finally, we validate our theoretical finding and demonstrate the superiority of our frameworks via experiments. Code is available at: https://github.com/ZeyuYan/Controllable-Perceptual-Compression}\n}", "pdf": "https://proceedings.mlr.press/v162/yan22b/yan22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yan22b-supp.zip", "pdf_size": 13576963, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15214339197144115082&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Brain-inspired Application Technology Center (BATC), School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China; Brain-inspired Application Technology Center (BATC), School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China; Brain-inspired Application Technology Center (BATC), School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "email": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "github": "https://github.com/ZeyuYan/Controllable-Perceptual-Compression", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yan22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "School of Electronic Information and Electrical Engineering", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Optimistic Linear Support and Successor Features as a Basis for Optimal Policy Transfer", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17315", "id": "17315", "proceeding": "https://proceedings.mlr.press/v162/alegre22a.html", "poster": "/media/PosterPDFs/ICML%202022/da0dba87d95286d836e37ca60ab1e734_LY3UWtT.png?t=1657765099.0793824", "slides": "", "author_site": "Lucas N. Alegre, Ana Lucia Cetertich Bazzan, Bruno C. da Silva", "author": "Lucas Nunes Alegre; Ana Bazzan; Bruno C. Da Silva", "abstract": "In many real-world applications, reinforcement learning (RL) agents might have to solve multiple tasks, each one typically modeled via a reward function. If reward functions are expressed linearly, and the agent has previously learned a set of policies for different tasks, successor features (SFs) can be exploited to combine such policies and identify reasonable solutions for new problems. However, the identified solutions are not guaranteed to be optimal. We introduce a novel algorithm that addresses this limitation. It allows RL agents to combine existing policies and directly identify optimal policies for arbitrary new problems, without requiring any further interactions with the environment. 
We first show (under mild assumptions) that the transfer learning problem tackled by SFs is equivalent to the problem of learning to optimize multiple objectives in RL. We then introduce an SF-based extension of the Optimistic Linear Support algorithm to learn a set of policies whose SFs form a convex coverage set. We prove that policies in this set can be combined via generalized policy improvement to construct optimal behaviors for any new linearly-expressible tasks, without requiring any additional training samples. We empirically show that our method outperforms state-of-the-art competing algorithms both in discrete and continuous domains under value function approximation.", "bibtex": "@InProceedings{pmlr-v162-alegre22a,\n title = \t {Optimistic Linear Support and Successor Features as a Basis for Optimal Policy Transfer},\n author = {Alegre, Lucas Nunes and Bazzan, Ana and Silva, Bruno C. Da},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {394--413},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alegre22a/alegre22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alegre22a.html},\n abstract = \t {In many real-world applications, reinforcement learning (RL) agents might have to solve multiple tasks, each one typically modeled via a reward function. If reward functions are expressed linearly, and the agent has previously learned a set of policies for different tasks, successor features (SFs) can be exploited to combine such policies and identify reasonable solutions for new problems. However, the identified solutions are not guaranteed to be optimal. We introduce a novel algorithm that addresses this limitation. It allows RL agents to combine existing policies and directly identify optimal policies for arbitrary new problems, without requiring any further interactions with the environment. We first show (under mild assumptions) that the transfer learning problem tackled by SFs is equivalent to the problem of learning to optimize multiple objectives in RL. We then introduce an SF-based extension of the Optimistic Linear Support algorithm to learn a set of policies whose SFs form a convex coverage set. We prove that policies in this set can be combined via generalized policy improvement to construct optimal behaviors for any new linearly-expressible tasks, without requiring any additional training samples. 
We empirically show that our method outperforms state-of-the-art competing algorithms both in discrete and continuous domains under value function approximation.}\n}", "pdf": "https://proceedings.mlr.press/v162/alegre22a/alegre22a.pdf", "supp": "", "pdf_size": 1078010, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=130731457432112857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Institute of Informatics, Federal University of Rio Grande do Sul, Porto Alegre, RS, Brazil; Institute of Informatics, Federal University of Rio Grande do Sul, Porto Alegre, RS, Brazil; University of Massachusetts Amherst, MA", "aff_domain": "inf.ufrgs.br; ; ", "email": "inf.ufrgs.br; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/alegre22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Federal University of Rio Grande do Sul;University of Massachusetts Amherst", "aff_unique_dep": "Institute of Informatics;", "aff_unique_url": "https://www.ufrgs.br;https://www.umass.edu", "aff_unique_abbr": "UFRGS;UMass Amherst", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Porto Alegre;Amherst", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Brazil;United States" }, { "title": "Optimization-Derived Learning with Essential Convergence Analysis of Training and Hyper-training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17659", "id": "17659", "proceeding": "https://proceedings.mlr.press/v162/liu22j.html", "poster": "/media/PosterPDFs/ICML%202022/d2cdf047a6674cef251d56544a3cf029_iQPwyJG.png?t=1657518776.7231135", "slides": "", "author_site": "Risheng Liu, Xuan Liu, Shangzhi Zeng, Jin Zhang, Yixuan ZHANG", "author": "Risheng Liu; Xuan Liu; Shangzhi Zeng; Jin Zhang; Yixuan Zhang", "abstract": "Recently, Optimization-Derived Learning (ODL) has attracted attention from learning and vision areas, which designs learning models from the perspective of optimization. However, previous ODL approaches regard the training and hyper-training procedures as two separated stages, meaning that the hyper-training variables have to be fixed during the training process, and thus it is also impossible to simultaneously obtain the convergence of training and hyper-training variables. In this work, we design a Generalized Krasnoselskii-Mann (GKM) scheme based on fixed-point iterations as our fundamental ODL module, which unifies existing ODL methods as special cases. Under the GKM scheme, a Bilevel Meta Optimization (BMO) algorithmic framework is constructed to solve the optimal training and hyper-training variables together. We rigorously prove the essential joint convergence of the fixed-point iteration for training and the process of optimizing hyper-parameters for hyper-training, both on the approximation quality, and on the stationary analysis. 
Experiments demonstrate the efficiency of BMO with competitive performance on sparse coding and real-world applications such as image deconvolution and rain streak removal.", "bibtex": "@InProceedings{pmlr-v162-liu22j,\n title = \t {Optimization-Derived Learning with Essential Convergence Analysis of Training and Hyper-training},\n author = {Liu, Risheng and Liu, Xuan and Zeng, Shangzhi and Zhang, Jin and Zhang, Yixuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13825--13856},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22j/liu22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22j.html},\n abstract = \t {Recently, Optimization-Derived Learning (ODL) has attracted attention from learning and vision areas, which designs learning models from the perspective of optimization. However, previous ODL approaches regard the training and hyper-training procedures as two separated stages, meaning that the hyper-training variables have to be fixed during the training process, and thus it is also impossible to simultaneously obtain the convergence of training and hyper-training variables. In this work, we design a Generalized Krasnoselskii-Mann (GKM) scheme based on fixed-point iterations as our fundamental ODL module, which unifies existing ODL methods as special cases. Under the GKM scheme, a Bilevel Meta Optimization (BMO) algorithmic framework is constructed to solve the optimal training and hyper-training variables together. We rigorously prove the essential joint convergence of the fixed-point iteration for training and the process of optimizing hyper-parameters for hyper-training, both on the approximation quality, and on the stationary analysis. 
Experiments demonstrate the efficiency of BMO with competitive performance on sparse coding and real-world applications such as image deconvolution and rain streak removal.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22j/liu22j.pdf", "supp": "", "pdf_size": 6989451, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9487406492982833153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "DUT-RU International School of Information Science and Engineering, Dalian University of Technology, Dalian, Liaoning, China+Key Laboratory for Ubiquitous Network and Service Software of Liaoning Province, Dalian, Liaoning, China+Peng Cheng Laboratory, Shenzhen, Guangdong, China; DUT-RU International School of Information Science and Engineering, Dalian University of Technology, Dalian, Liaoning, China+Key Laboratory for Ubiquitous Network and Service Software of Liaoning Province, Dalian, Liaoning, China; Department of Mathematics and Statistics, University of Victoria, Victoria, British Columbia, Canada; Department of Mathematics, SUSTech International Center for Mathematics, Southern University of Science and Technology, Shenzhen, Guangdong, China+National Center for Applied Mathematics Shenzhen, Shenzhen, Guangdong, China; Department of Mathematics, SUSTech International Center for Mathematics, Southern University of Science and Technology, Shenzhen, Guangdong, China+National Center for Applied Mathematics Shenzhen, Shenzhen, Guangdong, China", "aff_domain": "sustech.edu.cn; ; ; ; ", "email": "sustech.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22j.html", "aff_unique_index": "0+1+2;0+1;3;4+5;4+5", "aff_unique_norm": "Dalian University of Technology;Key Laboratory for Ubiquitous Network and Service Software of Liaoning Province;Pengcheng Laboratory;University of Victoria;Southern University of Science and Technology;National Center for Applied Mathematics", "aff_unique_dep": "International School of Information Science and Engineering;;Peng Cheng Laboratory;Department of Mathematics and Statistics;Department of Mathematics;", "aff_unique_url": "http://en.dlut.edu.cn/;;;https://www.uvic.ca;https://www.sustech.edu.cn;", "aff_unique_abbr": "DUT;;;UVic;SUSTech;", "aff_campus_unique_index": "0+0+1;0+0;2;1+1;1+1", "aff_campus_unique": "Dalian;Shenzhen;Victoria", "aff_country_unique_index": "0+0+0;0+0;1;0+0;0+0", "aff_country_unique": "China;Canada" }, { "title": "Optimization-Induced Graph Implicit Nonlinear Diffusion", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18147", "id": "18147", "proceeding": "https://proceedings.mlr.press/v162/chen22z.html", "poster": "/media/PosterPDFs/ICML%202022/2bb0502c80b7432eee4c5847a5fd077b.png?t=1657273814.3181093", "slides": "", "author_site": "Qi Chen, Yifei Wang, Yisen Wang, Jiansheng Yang, Zhouchen Lin", "author": "Qi Chen; Yifei Wang; Yisen Wang; Jiansheng Yang; Zhouchen Lin", "abstract": "Due to the over-smoothing issue, most existing graph neural networks can only capture limited dependencies with their inherently finite aggregation layers. To overcome this limitation, we propose a new kind of graph convolution, called Graph Implicit Nonlinear Diffusion (GIND), which implicitly has access to infinite hops of neighbors while adaptively aggregating features with nonlinear diffusion to prevent over-smoothing. 
Notably, we show that the learned representation can be formalized as the minimizer of an explicit convex optimization objective. With this property, we can theoretically characterize the equilibrium of our GIND from an optimization perspective. More interestingly, we can induce new structural variants by modifying the corresponding optimization objective. To be specific, we can embed prior properties to the equilibrium, as well as introducing skip connections to promote training stability. Extensive experiments show that GIND is good at capturing long-range dependencies, and performs well on both homophilic and heterophilic graphs with nonlinear diffusion. Moreover, we show that the optimization-induced variants of our models can boost the performance and improve training stability and efficiency as well. As a result, our GIND obtains significant improvements on both node-level and graph-level tasks.", "bibtex": "@InProceedings{pmlr-v162-chen22z,\n title = \t {Optimization-Induced Graph Implicit Nonlinear Diffusion},\n author = {Chen, Qi and Wang, Yifei and Wang, Yisen and Yang, Jiansheng and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3648--3661},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22z/chen22z.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22z.html},\n abstract = \t {Due to the over-smoothing issue, most existing graph neural networks can only capture limited dependencies with their inherently finite aggregation layers. To overcome this limitation, we propose a new kind of graph convolution, called Graph Implicit Nonlinear Diffusion (GIND), which implicitly has access to infinite hops of neighbors while adaptively aggregating features with nonlinear diffusion to prevent over-smoothing. Notably, we show that the learned representation can be formalized as the minimizer of an explicit convex optimization objective. With this property, we can theoretically characterize the equilibrium of our GIND from an optimization perspective. More interestingly, we can induce new structural variants by modifying the corresponding optimization objective. To be specific, we can embed prior properties to the equilibrium, as well as introducing skip connections to promote training stability. Extensive experiments show that GIND is good at capturing long-range dependencies, and performs well on both homophilic and heterophilic graphs with nonlinear diffusion. Moreover, we show that the optimization-induced variants of our models can boost the performance and improve training stability and efficiency as well. As a result, our GIND obtains significant improvements on both node-level and graph-level tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22z/chen22z.pdf", "supp": "", "pdf_size": 420272, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1600506523476072350&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Mathematical Sciences, Peking University, China; School of Mathematical Sciences, Peking University, China; Key Lab. 
of Machine Perception (MoE), School of Artificial Intelligence, Peking University, China+Institute for Artificial Intelligence, Peking University, China+Peng Cheng Laboratory, China; School of Mathematical Sciences, Peking University, China; Key Lab. of Machine Perception (MoE), School of Artificial Intelligence, Peking University, China+Institute for Artificial Intelligence, Peking University, China+Peng Cheng Laboratory, China", "aff_domain": "pku.edu.cn; ; ; ;pku.edu.cn", "email": "pku.edu.cn; ; ; ;pku.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22z.html", "aff_unique_index": "0;0;0+0+1;0;0+0+1", "aff_unique_norm": "Peking University;Pengcheng Laboratory", "aff_unique_dep": "School of Mathematical Sciences;Peng Cheng Laboratory", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0+0;0;0+0+0", "aff_country_unique": "China" }, { "title": "Optimizing Sequential Experimental Design with Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17107", "id": "17107", "proceeding": "https://proceedings.mlr.press/v162/blau22a.html", "poster": "/media/PosterPDFs/ICML%202022/42778ef0b5805a96f9511e20b5611fce.png?t=1657175734.209684", "slides": "", "author_site": "Tom Blau, Edwin V Bonilla, Iadine Chades, Amir Dezfouli", "author": "Tom Blau; Edwin V. Bonilla; Iadine Chades; Amir Dezfouli", "abstract": "Bayesian approaches developed to solve the optimal design of sequential experiments are mathematically elegant but computationally challenging. Recently, techniques using amortization have been proposed to make these Bayesian approaches practical, by training a parameterized policy that proposes designs efficiently at deployment time. However, these methods may not sufficiently explore the design space, require access to a differentiable probabilistic model and can only optimize over continuous design spaces. Here, we address these limitations by showing that the problem of optimizing policies can be reduced to solving a Markov decision process (MDP). We solve the equivalent MDP with modern deep reinforcement learning techniques. Our experiments show that our approach is also computationally efficient at deployment time and exhibits state-of-the-art performance on both continuous and discrete design spaces, even when the probabilistic model is a black box.", "bibtex": "@InProceedings{pmlr-v162-blau22a,\n title = \t {Optimizing Sequential Experimental Design with Deep Reinforcement Learning},\n author = {Blau, Tom and Bonilla, Edwin V. and Chades, Iadine and Dezfouli, Amir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2107--2128},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/blau22a/blau22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/blau22a.html},\n abstract = \t {Bayesian approaches developed to solve the optimal design of sequential experiments are mathematically elegant but computationally challenging. 
Recently, techniques using amortization have been proposed to make these Bayesian approaches practical, by training a parameterized policy that proposes designs efficiently at deployment time. However, these methods may not sufficiently explore the design space, require access to a differentiable probabilistic model and can only optimize over continuous design spaces. Here, we address these limitations by showing that the problem of optimizing policies can be reduced to solving a Markov decision process (MDP). We solve the equivalent MDP with modern deep reinforcement learning techniques. Our experiments show that our approach is also computationally efficient at deployment time and exhibits state-of-the-art performance on both continuous and discrete design spaces, even when the probabilistic model is a black box.}\n}", "pdf": "https://proceedings.mlr.press/v162/blau22a/blau22a.pdf", "supp": "", "pdf_size": 1281120, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17698300138792965088&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "CSIRO\u2019s Data61, Australia; CSIRO\u2019s Data61, Australia; CSIRO\u2019s Land and Water, Australia; CSIRO\u2019s Data61, Australia", "aff_domain": "data61.csiro.au; ; ; ", "email": "data61.csiro.au; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/blau22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "CSIRO", "aff_unique_dep": "Data61", "aff_unique_url": "https://www.csiro.au", "aff_unique_abbr": "CSIRO", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "title": "Optimizing Tensor Network Contraction Using Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17917", "id": "17917", "proceeding": "https://proceedings.mlr.press/v162/meirom22a.html", "poster": "/media/PosterPDFs/ICML%202022/2bc8ae25856bc2a6a1333d1331a3b7a6.png?t=1658245781.9230518", "slides": "", "author_site": "Eli Meirom, Haggai Maron, Shie Mannor, Gal Chechik", "author": "Eli Meirom; Haggai Maron; Shie Mannor; Gal Chechik", "abstract": "Quantum Computing (QC) stands to revolutionize computing, but is currently still limited. To develop and test quantum algorithms today, quantum circuits are often simulated on classical computers. Simulating a complex quantum circuit requires computing the contraction of a large network of tensors. The order (path) of contraction can have a drastic effect on the computing cost, but finding an efficient order is a challenging combinatorial optimization problem. We propose a Reinforcement Learning (RL) approach combined with Graph Neural Networks (GNN) to address the contraction ordering problem. The problem is extremely challenging due to the huge search space, the heavy-tailed reward distribution, and the challenging credit assignment. 
We show how a carefully implemented RL-agent that uses a GNN as the basic policy construct can address these challenges and obtain significant improvements over state-of-the-art techniques in three varieties of circuits, including the largest scale networks used in contemporary QC.", "bibtex": "@InProceedings{pmlr-v162-meirom22a,\n title = \t {Optimizing Tensor Network Contraction Using Reinforcement Learning},\n author = {Meirom, Eli and Maron, Haggai and Mannor, Shie and Chechik, Gal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15278--15292},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/meirom22a/meirom22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/meirom22a.html},\n abstract = \t {Quantum Computing (QC) stands to revolutionize computing, but is currently still limited. To develop and test quantum algorithms today, quantum circuits are often simulated on classical computers. Simulating a complex quantum circuit requires computing the contraction of a large network of tensors. The order (path) of contraction can have a drastic effect on the computing cost, but finding an efficient order is a challenging combinatorial optimization problem. We propose a Reinforcement Learning (RL) approach combined with Graph Neural Networks (GNN) to address the contraction ordering problem. The problem is extremely challenging due to the huge search space, the heavy-tailed reward distribution, and the challenging credit assignment. 
We show how a carefully implemented RL-agent that uses a GNN as the basic policy construct can address these challenges and obtain significant improvements over state-of-the-art techniques in three varieties of circuits, including the largest scale networks used in contemporary QC.}\n}", "pdf": "https://proceedings.mlr.press/v162/meirom22a/meirom22a.pdf", "supp": "", "pdf_size": 964883, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17337310049793317740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Nvidia research, Israel; Nvidia research, Israel; Nvidia research, Israel; Nvidia research, Israel", "aff_domain": "nvidia.com; ; ; ", "email": "nvidia.com; ; ; ", "github": "", "project": "https://nv-research-tlv.netlify.app/publication/tensor_contraction/", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/meirom22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "NVIDIA", "aff_unique_dep": "NVIDIA Research", "aff_unique_url": "https://www.nvidia.com/research", "aff_unique_abbr": "NV", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Orchestra: Unsupervised Federated Learning via Globally Consistent Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18301", "id": "18301", "proceeding": "https://proceedings.mlr.press/v162/lubana22a.html", "poster": "/media/PosterPDFs/ICML%202022/80a8155eb153025ea1d513d0b2c4b675.png?t=1657917947.7470691", "slides": "", "author_site": "Ekdeep Singh Lubana, Chi Ian Tang, Fahim Kawsar, Robert Dick, Akhil Mathur", "author": "Ekdeep Lubana; Chi Ian Tang; Fahim Kawsar; Robert Dick; Akhil Mathur", "abstract": "Federated learning is generally used in tasks where labels are readily available (e.g., next word prediction). Relaxing this constraint requires design of unsupervised learning techniques that can support desirable properties for federated training: robustness to statistical/systems heterogeneity, scalability with number of participants, and communication efficiency. Prior work on this topic has focused on directly extending centralized self-supervised learning techniques, which are not designed to have the properties listed above. To address this situation, we propose Orchestra, a novel unsupervised federated learning technique that exploits the federation\u2019s hierarchy to orchestrate a distributed clustering task and enforce a globally consistent partitioning of clients\u2019 data into discriminable clusters. 
We show the algorithmic pipeline in Orchestra guarantees good generalization performance under a linear probe, allowing it to outperform alternative techniques in a broad range of conditions, including variation in heterogeneity, number of clients, participation ratio, and local epochs.", "bibtex": "@InProceedings{pmlr-v162-lubana22a,\n title = \t {Orchestra: Unsupervised Federated Learning via Globally Consistent Clustering},\n author = {Lubana, Ekdeep and Tang, Chi Ian and Kawsar, Fahim and Dick, Robert and Mathur, Akhil},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14461--14484},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lubana22a/lubana22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lubana22a.html},\n abstract = \t {Federated learning is generally used in tasks where labels are readily available (e.g., next word prediction). Relaxing this constraint requires design of unsupervised learning techniques that can support desirable properties for federated training: robustness to statistical/systems heterogeneity, scalability with number of participants, and communication efficiency. Prior work on this topic has focused on directly extending centralized self-supervised learning techniques, which are not designed to have the properties listed above. To address this situation, we propose Orchestra, a novel unsupervised federated learning technique that exploits the federation\u2019s hierarchy to orchestrate a distributed clustering task and enforce a globally consistent partitioning of clients\u2019 data into discriminable clusters. 
We show the algorithmic pipeline in Orchestra guarantees good generalization performance under a linear probe, allowing it to outperform alternative techniques in a broad range of conditions, including variation in heterogeneity, number of clients, participation ratio, and local epochs.}\n}", "pdf": "https://proceedings.mlr.press/v162/lubana22a/lubana22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/lubana22a-supp.zip", "pdf_size": 11251644, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12370876234487104592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "EECS Department, University of Michigan, Ann Arbor, USA+Work performed while the first two authors were interns at Nokia Bell Labs, UK; University of Cambridge, UK; Nokia Bell Labs, Cambridge, UK; EECS Department, University of Michigan, Ann Arbor, USA; Nokia Bell Labs, Cambridge, UK", "aff_domain": "umich.edu; ; ; ; ", "email": "umich.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lubana22a.html", "aff_unique_index": "0+1;2;1;0;1", "aff_unique_norm": "University of Michigan;Nokia Bell Labs;University of Cambridge", "aff_unique_dep": "EECS Department;;", "aff_unique_url": "https://www.umich.edu;https://www.nokia.com/bell-labs/;https://www.cam.ac.uk", "aff_unique_abbr": "UM;Nokia Bell Labs;Cambridge", "aff_campus_unique_index": "0;2;2;0;2", "aff_campus_unique": "Ann Arbor;;Cambridge", "aff_country_unique_index": "0+1;1;1;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Order Constraints in Optimal Transport", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16075", "id": "16075", "proceeding": "https://proceedings.mlr.press/v162/lim22b.html", "poster": "/media/PosterPDFs/ICML%202022/64eec0c3fb6b12c43f51ec9e9c773fed.png?t=1657781494.5603805", "slides": "", "author_site": "Yu Chin Fabian Lim, Laura Wynter, Shiau Hong Lim", "author": "Yu Chin Fabian Lim; Laura Wynter; Shiau Hong Lim", "abstract": "Optimal transport is a framework for comparing measures whereby a cost is incurred for transporting one measure to another. Recent works have aimed to improve optimal transport plans through the introduction of various forms of structure. We introduce novel order constraints into the optimal transport formulation to allow for the incorporation of structure. We define an efficient method for obtaining explainable solutions to the new formulation that scales far better than standard approaches. The theoretical properties of the method are provided. 
We demonstrate experimentally that order constraints improve explainability using the e-SNLI (Stanford Natural Language Inference) dataset that includes human-annotated rationales as well as on several image color transfer examples.", "bibtex": "@InProceedings{pmlr-v162-lim22b,\n title = \t {Order Constraints in Optimal Transport},\n author = {Lim, Yu Chin Fabian and Wynter, Laura and Lim, Shiau Hong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13313--13333},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lim22b/lim22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lim22b.html},\n abstract = \t {Optimal transport is a framework for comparing measures whereby a cost is incurred for transporting one measure to another. Recent works have aimed to improve optimal transport plans through the introduction of various forms of structure. We introduce novel order constraints into the optimal transport formulation to allow for the incorporation of structure. We define an efficient method for obtaining explainable solutions to the new formulation that scales far better than standard approaches. The theoretical properties of the method are provided. We demonstrate experimentally that order constraints improve explainability using the e-SNLI (Stanford Natural Language Inference) dataset that includes human-annotated rationales as well as on several image color transfer examples.}\n}", "pdf": "https://proceedings.mlr.press/v162/lim22b/lim22b.pdf", "supp": "", "pdf_size": 4750438, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1063075229818760095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "IBM Research, Singapore; IBM Research, Singapore; IBM Research, Singapore", "aff_domain": "sg.ibm.com; ; ", "email": "sg.ibm.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lim22b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "IBM", "aff_unique_dep": "IBM Research", "aff_unique_url": "https://www.ibm.com/research", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Out-of-Distribution Detection with Deep Nearest Neighbors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16493", "id": "16493", "proceeding": "https://proceedings.mlr.press/v162/sun22d.html", "poster": "/media/PosterPDFs/ICML%202022/0a5c79b1eaf15445da252ada718857e9.png?t=1657511421.940845", "slides": "", "author_site": "Yiyou Sun, Yifei Ming, Jerry Zhu, Yixuan Li", "author": "Yiyou Sun; Yifei Ming; Xiaojin Zhu; Yixuan Li", "abstract": "Out-of-distribution (OOD) detection is a critical task for deploying machine learning models in the open world. Distance-based methods have demonstrated promise, where testing samples are detected as OOD if they are relatively far away from in-distribution (ID) data. However, prior methods impose a strong distributional assumption of the underlying feature space, which may not always hold. 
In this paper, we explore the efficacy of non-parametric nearest-neighbor distance for OOD detection, which has been largely overlooked in the literature. Unlike prior works, our method does not impose any distributional assumption, hence providing stronger flexibility and generality. We demonstrate the effectiveness of nearest-neighbor-based OOD detection on several benchmarks and establish superior performance. Under the same model trained on ImageNet-1k, our method substantially reduces the false positive rate (FPR@TPR95) by 24.77% compared to a strong baseline SSD+, which uses a parametric approach Mahalanobis distance in detection. Code is available: https://github.com/deeplearning-wisc/knn-ood.", "bibtex": "@InProceedings{pmlr-v162-sun22d,\n title = \t {Out-of-Distribution Detection with Deep Nearest Neighbors},\n author = {Sun, Yiyou and Ming, Yifei and Zhu, Xiaojin and Li, Yixuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20827--20840},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sun22d/sun22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/sun22d.html},\n abstract = \t {Out-of-distribution (OOD) detection is a critical task for deploying machine learning models in the open world. Distance-based methods have demonstrated promise, where testing samples are detected as OOD if they are relatively far away from in-distribution (ID) data. However, prior methods impose a strong distributional assumption of the underlying feature space, which may not always hold. In this paper, we explore the efficacy of non-parametric nearest-neighbor distance for OOD detection, which has been largely overlooked in the literature. Unlike prior works, our method does not impose any distributional assumption, hence providing stronger flexibility and generality. We demonstrate the effectiveness of nearest-neighbor-based OOD detection on several benchmarks and establish superior performance. Under the same model trained on ImageNet-1k, our method substantially reduces the false positive rate (FPR@TPR95) by 24.77% compared to a strong baseline SSD+, which uses a parametric approach Mahalanobis distance in detection. 
Code is available: https://github.com/deeplearning-wisc/knn-ood.}\n}", "pdf": "https://proceedings.mlr.press/v162/sun22d/sun22d.pdf", "supp": "", "pdf_size": 1307690, "gs_citation": 635, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8587930909818673494&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Sciences, University of Wisconsin - Madison; Department of Computer Sciences, University of Wisconsin - Madison; Department of Computer Sciences, University of Wisconsin - Madison; Department of Computer Sciences, University of Wisconsin - Madison", "aff_domain": "cs.wisc.edu;cs.wisc.edu;cs.wisc.edu;cs.wisc.edu", "email": "cs.wisc.edu;cs.wisc.edu;cs.wisc.edu;cs.wisc.edu", "github": "https://github.com/deeplearning-wisc/knn-ood", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/sun22d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Overcoming Oscillations in Quantization-Aware Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17839", "id": "17839", "proceeding": "https://proceedings.mlr.press/v162/nagel22a.html", "poster": "/media/PosterPDFs/ICML%202022/effffa8deef3c927fefc014850129bb6.png?t=1657719015.0068672", "slides": "", "author_site": "Markus Nagel, Marios Fournarakis, Yelysei Bondarenko, Tijmen Blankevoort", "author": "Markus Nagel; Marios Fournarakis; Yelysei Bondarenko; Tijmen Blankevoort", "abstract": "When training neural networks with simulated quantization, we observe that quantized weights can, rather unexpectedly, oscillate between two grid-points. The importance of this effect and its impact on quantization-aware training (QAT) are not well-understood or investigated in literature. In this paper, we delve deeper into the phenomenon of weight oscillations and show that it can lead to a significant accuracy degradation due to wrongly estimated batch-normalization statistics during inference and increased noise during training. These effects are particularly pronounced in low-bit ($\\leq$ 4-bits) quantization of efficient networks with depth-wise separable layers, such as MobileNets and EfficientNets. In our analysis we investigate several previously proposed QAT algorithms and show that most of these are unable to overcome oscillations. Finally, we propose two novel QAT algorithms to overcome oscillations during training: oscillation dampening and iterative weight freezing. We demonstrate that our algorithms achieve state-of-the-art accuracy for low-bit (3 & 4 bits) weight and activation quantization of efficient architectures, such as MobileNetV2, MobileNetV3, and EfficentNet-lite on ImageNet. 
Our source code is available at https://github.com/qualcomm-ai-research/oscillations-qat.", "bibtex": "@InProceedings{pmlr-v162-nagel22a,\n title = \t {Overcoming Oscillations in Quantization-Aware Training},\n author = {Nagel, Markus and Fournarakis, Marios and Bondarenko, Yelysei and Blankevoort, Tijmen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16318--16330},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nagel22a/nagel22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nagel22a.html},\n abstract = \t {When training neural networks with simulated quantization, we observe that quantized weights can, rather unexpectedly, oscillate between two grid-points. The importance of this effect and its impact on quantization-aware training (QAT) are not well-understood or investigated in literature. In this paper, we delve deeper into the phenomenon of weight oscillations and show that it can lead to a significant accuracy degradation due to wrongly estimated batch-normalization statistics during inference and increased noise during training. These effects are particularly pronounced in low-bit ($\\leq$ 4-bits) quantization of efficient networks with depth-wise separable layers, such as MobileNets and EfficientNets. In our analysis we investigate several previously proposed QAT algorithms and show that most of these are unable to overcome oscillations. Finally, we propose two novel QAT algorithms to overcome oscillations during training: oscillation dampening and iterative weight freezing. We demonstrate that our algorithms achieve state-of-the-art accuracy for low-bit (3 & 4 bits) weight and activation quantization of efficient architectures, such as MobileNetV2, MobileNetV3, and EfficentNet-lite on ImageNet. 
Our source code is available at https://github.com/qualcomm-ai-research/oscillations-qat.}\n}", "pdf": "https://proceedings.mlr.press/v162/nagel22a/nagel22a.pdf", "supp": "", "pdf_size": 1942240, "gs_citation": 135, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7420900147449297727&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Qualcomm AI Research; Qualcomm AI Research; Qualcomm AI Research; Qualcomm AI Research", "aff_domain": "qti.qualcomm.com;qti.qualcomm.com; ; ", "email": "qti.qualcomm.com;qti.qualcomm.com; ; ", "github": "https://github.com/qualcomm-ai-research/oscillations-qat", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nagel22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Qualcomm", "aff_unique_dep": "Qualcomm AI Research", "aff_unique_url": "https://www.qualcomm.com/research", "aff_unique_abbr": "QAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "PAC-Bayesian Bounds on Rate-Efficient Classifiers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16337", "id": "16337", "proceeding": "https://proceedings.mlr.press/v162/abbas22a.html", "poster": "", "slides": "", "author_site": "Alhabib Abbas, Yiannis Andreopoulos", "author": "Alhabib Abbas; Yiannis Andreopoulos", "abstract": "We derive analytic bounds on the noise invariance of majority vote classifiers operating on compressed inputs. Specifically, starting from recent bounds on the true risk of majority vote classifiers, we extend the applicability of PAC-Bayesian theory to quantify the resilience of majority votes to input noise stemming from compression. The derived bounds are intuitive in binary classification settings, where they can be measured as expressions of voter differentials and voter pair agreement. By combining measures of input distortion with analytic guarantees on noise invariance, we prescribe rate-efficient machines to compress inputs without affecting subsequent classification. Our validation shows how bounding noise invariance can inform the compression stage for any majority vote classifier such that worst-case implications of bad input reconstructions are known, and inputs can be compressed to the minimum amount of information needed prior to inference.", "bibtex": "@InProceedings{pmlr-v162-abbas22a,\n title = \t {{PAC}-{B}ayesian Bounds on Rate-Efficient Classifiers},\n author = {Abbas, Alhabib and Andreopoulos, Yiannis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1--9},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/abbas22a/abbas22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/abbas22a.html},\n abstract = \t {We derive analytic bounds on the noise invariance of majority vote classifiers operating on compressed inputs. Specifically, starting from recent bounds on the true risk of majority vote classifiers, we extend the applicability of PAC-Bayesian theory to quantify the resilience of majority votes to input noise stemming from compression. 
The derived bounds are intuitive in binary classification settings, where they can be measured as expressions of voter differentials and voter pair agreement. By combining measures of input distortion with analytic guarantees on noise invariance, we prescribe rate-efficient machines to compress inputs without affecting subsequent classification. Our validation shows how bounding noise invariance can inform the compression stage for any majority vote classifier such that worst-case implications of bad input reconstructions are known, and inputs can be compressed to the minimum amount of information needed prior to inference.}\n}", "pdf": "https://proceedings.mlr.press/v162/abbas22a/abbas22a.pdf", "supp": "", "pdf_size": 1365856, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:M4-YfrvTONAJ:scholar.google.com/&scioq=PAC-Bayesian+Bounds+on+Rate-Efficient+Classifiers&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff": "Meme Research Ltd., London, UK+Dept. of Electronic and Electrical Eng., University College London, London, UK; Dept. of Electronic and Electrical Eng., University College London, London, UK", "aff_domain": "ucl.ac.uk;ucl.ac.uk", "email": "ucl.ac.uk;ucl.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/abbas22a.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "Meme Research Ltd.;University College London", "aff_unique_dep": ";Dept. of Electronic and Electrical Eng.", "aff_unique_url": ";https://www.ucl.ac.uk", "aff_unique_abbr": ";UCL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";London", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "PAC-Net: A Model Pruning Approach to Inductive Transfer Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17175", "id": "17175", "proceeding": "https://proceedings.mlr.press/v162/myung22a.html", "poster": "/media/PosterPDFs/ICML%202022/67c6a1e7ce56d3d6fa748ab6d9af3fd7.png?t=1657029683.1483312", "slides": "/media/icml-2022/Slides/17175.pdf", "author_site": "Sanghoon Myung, In Huh, Wonik Jang, Jae Myung Choe, Jisu Ryu, Daesin Kim, Kee-Eung Kim, Changwook Jeong", "author": "Sanghoon Myung; In Huh; Wonik Jang; Jae Myung Choe; Jisu Ryu; Daesin Kim; Kee-Eung Kim; Changwook Jeong", "abstract": "Inductive transfer learning aims to learn from a small amount of training data for the target task by utilizing a pre-trained model from the source task. Most strategies that involve large-scale deep learning models adopt initialization with the pre-trained model and fine-tuning for the target task. However, when using over-parameterized models, we can often prune the model without sacrificing the accuracy of the source task. This motivates us to adopt model pruning for transfer learning with deep learning models. In this paper, we propose PAC-Net, a simple yet effective approach for transfer learning based on pruning. PAC-Net consists of three steps: Prune, Allocate, and Calibrate (PAC). The main idea behind these steps is to identify essential weights for the source task, fine-tune on the source task by updating the essential weights, and then calibrate on the target task by updating the remaining redundant weights. 
Under the various and extensive set of inductive transfer learning experiments, we show that our method achieves state-of-the-art performance by a large margin.", "bibtex": "@InProceedings{pmlr-v162-myung22a,\n title = \t {{PAC}-Net: A Model Pruning Approach to Inductive Transfer Learning},\n author = {Myung, Sanghoon and Huh, In and Jang, Wonik and Choe, Jae Myung and Ryu, Jisu and Kim, Daesin and Kim, Kee-Eung and Jeong, Changwook},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16240--16252},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/myung22a/myung22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/myung22a.html},\n abstract = \t {Inductive transfer learning aims to learn from a small amount of training data for the target task by utilizing a pre-trained model from the source task. Most strategies that involve large-scale deep learning models adopt initialization with the pre-trained model and fine-tuning for the target task. However, when using over-parameterized models, we can often prune the model without sacrificing the accuracy of the source task. This motivates us to adopt model pruning for transfer learning with deep learning models. In this paper, we propose PAC-Net, a simple yet effective approach for transfer learning based on pruning. PAC-Net consists of three steps: Prune, Allocate, and Calibrate (PAC). The main idea behind these steps is to identify essential weights for the source task, fine-tune on the source task by updating the essential weights, and then calibrate on the target task by updating the remaining redundant weights. 
Under the various and extensive set of inductive transfer learning experiments, we show that our method achieves state-of-the-art performance by a large margin.}\n}", "pdf": "https://proceedings.mlr.press/v162/myung22a/myung22a.pdf", "supp": "", "pdf_size": 2627867, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5794519428230196170&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "CSE Team, Innovation Center, Samsung Electronics; CSE Team, Innovation Center, Samsung Electronics; CSE Team, Innovation Center, Samsung Electronics; CSE Team, Innovation Center, Samsung Electronics; CSE Team, Innovation Center, Samsung Electronics; CSE Team, Innovation Center, Samsung Electronics; Kim Jaechul Graduate School of AI, KAIST; Graduate School of Semiconductor Materials and Devices Engineering, UNIST", "aff_domain": "samsung.com; ; ; ; ; ;kaist.ac.kr;unist.ac.kr", "email": "samsung.com; ; ; ; ; ;kaist.ac.kr;unist.ac.kr", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/myung22a.html", "aff_unique_index": "0;0;0;0;0;0;1;2", "aff_unique_norm": "Samsung;KAIST;Ulsan National Institute of Science and Technology", "aff_unique_dep": "CSE Team, Innovation Center;Kim Jaechul Graduate School of AI;Graduate School of Semiconductor Materials and Devices Engineering", "aff_unique_url": "https://www.samsung.com;https://www.kaist.edu;https://www.unist.ac.kr", "aff_unique_abbr": "Samsung;KAIST;UNIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "PACE: A Parallelizable Computation Encoder for Directed Acyclic Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17969", "id": "17969", "proceeding": "https://proceedings.mlr.press/v162/dong22b.html", "poster": "/media/PosterPDFs/ICML%202022/4b7a55505729b7f664e7222960e9c2d5.png?t=1657388880.571791", "slides": "", "author_site": "Zehao Dong, Muhan Zhang, Fuhai Li, Yixin Chen", "author": "Zehao Dong; Muhan Zhang; Fuhai Li; Yixin Chen", "abstract": "Optimization of directed acyclic graph (DAG) structures has many applications, such as neural architecture search (NAS) and probabilistic graphical model learning. Encoding DAGs into real vectors is a dominant component in most neural-network-based DAG optimization frameworks. Currently, most popular DAG encoders use an asynchronous message passing scheme which sequentially processes nodes according to the dependency between nodes in a DAG. That is, a node must not be processed until all its predecessors are processed. As a result, they are inherently not parallelizable. In this work, we propose a Parallelizable Attention-based Computation structure Encoder (PACE) that processes nodes simultaneously and encodes DAGs in parallel. We demonstrate the superiority of PACE through encoder-dependent optimization subroutines that search the optimal DAG structure based on the learned DAG embeddings. 
Experiments show that PACE not only improves the effectiveness over previous sequential DAG encoders with a significantly boosted training and inference speed, but also generates smooth latent (DAG encoding) spaces that are beneficial to downstream optimization subroutines.", "bibtex": "@InProceedings{pmlr-v162-dong22b,\n title = \t {{PACE}: A Parallelizable Computation Encoder for Directed Acyclic Graphs},\n author = {Dong, Zehao and Zhang, Muhan and Li, Fuhai and Chen, Yixin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5360--5377},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dong22b/dong22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/dong22b.html},\n abstract = \t {Optimization of directed acyclic graph (DAG) structures has many applications, such as neural architecture search (NAS) and probabilistic graphical model learning. Encoding DAGs into real vectors is a dominant component in most neural-network-based DAG optimization frameworks. Currently, most popular DAG encoders use an asynchronous message passing scheme which sequentially processes nodes according to the dependency between nodes in a DAG. That is, a node must not be processed until all its predecessors are processed. As a result, they are inherently not parallelizable. In this work, we propose a Parallelizable Attention-based Computation structure Encoder (PACE) that processes nodes simultaneously and encodes DAGs in parallel. We demonstrate the superiority of PACE through encoder-dependent optimization subroutines that search the optimal DAG structure based on the learned DAG embeddings. Experiments show that PACE not only improves the effectiveness over previous sequential DAG encoders with a significantly boosted training and inference speed, but also generates smooth latent (DAG encoding) spaces that are beneficial to downstream optimization subroutines.}\n}", "pdf": "https://proceedings.mlr.press/v162/dong22b/dong22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/dong22b-supp.zip", "pdf_size": 1146235, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11354614986119464774&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science & Engineering, Washington University in St. Louis, St. Louis, USA+Institute for Artificial Intelligence, Peking University, Beijing, China+Beijing Institute for General Artificial Intelligence, Beijing, China; Institute for Artificial Intelligence, Peking University, Beijing, China+Beijing Institute for General Artificial Intelligence, Beijing, China; Institute for Informatics and Department of Pediatrics, Washington University in St. Louis, St. Louis, USA; Department of Computer Science & Engineering, Washington University in St. Louis, St. Louis, USA", "aff_domain": "wustl.edu;pku.edu.cn;wustl.edu;cse.wustl.edu", "email": "wustl.edu;pku.edu.cn;wustl.edu;cse.wustl.edu", "github": "https://github.com/zehao-dong/PACE", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/dong22b.html", "aff_unique_index": "0+1+2;1+2;0;0", "aff_unique_norm": "Washington University in St. 
Louis;Peking University;Beijing Institute for General Artificial Intelligence", "aff_unique_dep": "Department of Computer Science & Engineering;Institute for Artificial Intelligence;", "aff_unique_url": "https://wustl.edu;http://www.pku.edu.cn;", "aff_unique_abbr": "WashU;PKU;", "aff_campus_unique_index": "0+1+1;1+1;0;0", "aff_campus_unique": "St. Louis;Beijing", "aff_country_unique_index": "0+1+1;1+1;0;0", "aff_country_unique": "United States;China" }, { "title": "PAGE-PG: A Simple and Loopless Variance-Reduced Policy Gradient Method with Probabilistic Gradient Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17219", "id": "17219", "proceeding": "https://proceedings.mlr.press/v162/gargiani22a.html", "poster": "/media/PosterPDFs/ICML%202022/cd5099c73f75235d60ec0e90c4a092aa_wCPOrnt.png?t=1657621751.3118806", "slides": "", "author_site": "Matilde Gargiani, Andrea Zanelli, Andrea Martinelli, Tyler Summers, John Lygeros", "author": "Matilde Gargiani; Andrea Zanelli; Andrea Martinelli; Tyler Summers; John Lygeros", "abstract": "Despite their success, policy gradient methods suffer from high variance of the gradient estimator, which can result in unsatisfactory sample complexity. Recently, numerous variance-reduced extensions of policy gradient methods with provably better sample complexity and competitive numerical performance have been proposed. After a compact survey on some of the main variance-reduced REINFORCE-type methods, we propose ProbAbilistic Gradient Estimation for Policy Gradient (PAGE-PG), a novel loopless variance-reduced policy gradient method based on a probabilistic switch between two types of update. Our method is inspired by the PAGE estimator for supervised learning and leverages importance sampling to obtain an unbiased gradient estimator. We show that PAGE-PG enjoys a $\\mathcal{O}\\left( \\epsilon^{-3} \\right)$ average sample complexity to reach an $\\epsilon$-stationary solution, which matches the sample complexity of its most competitive counterparts under the same setting. A numerical evaluation confirms the competitive performance of our method on classical control tasks.", "bibtex": "@InProceedings{pmlr-v162-gargiani22a,\n title = \t {{PAGE}-{PG}: A Simple and Loopless Variance-Reduced Policy Gradient Method with Probabilistic Gradient Estimation},\n author = {Gargiani, Matilde and Zanelli, Andrea and Martinelli, Andrea and Summers, Tyler and Lygeros, John},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7223--7240},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gargiani22a/gargiani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gargiani22a.html},\n abstract = \t {Despite their success, policy gradient methods suffer from high variance of the gradient estimator, which can result in unsatisfactory sample complexity. Recently, numerous variance-reduced extensions of policy gradient methods with provably better sample complexity and competitive numerical performance have been proposed. 
After a compact survey on some of the main variance-reduced REINFORCE-type methods, we propose ProbAbilistic Gradient Estimation for Policy Gradient (PAGE-PG), a novel loopless variance-reduced policy gradient method based on a probabilistic switch between two types of update. Our method is inspired by the PAGE estimator for supervised learning and leverages importance sampling to obtain an unbiased gradient estimator. We show that PAGE-PG enjoys a $\\mathcal{O}\\left( \\epsilon^{-3} \\right)$ average sample complexity to reach an $\\epsilon$-stationary solution, which matches the sample complexity of its most competitive counterparts under the same setting. A numerical evaluation confirms the competitive performance of our method on classical control tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/gargiani22a/gargiani22a.pdf", "supp": "", "pdf_size": 1078967, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1881876592319313067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Automatic Control Laboratory at ETH Zurich; Institute for Dynamic Systems and Control at ETH Zurich; Automatic Control Laboratory at ETH Zurich; Department of Mechanical Engineering at the University of Texas at Dallas; Automatic Control Laboratory at ETH Zurich", "aff_domain": "ethz.ch; ; ; ; ", "email": "ethz.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/gargiani22a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "ETH Zurich;University of Texas at Dallas", "aff_unique_dep": "Automatic Control Laboratory;Department of Mechanical Engineering", "aff_unique_url": "https://www.ethz.ch;https://www.utdallas.edu", "aff_unique_abbr": "ETHZ;UT Dallas", "aff_campus_unique_index": "1", "aff_campus_unique": ";Dallas", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "PDE-Based Optimal Strategy for Unconstrained Online Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16329", "id": "16329", "proceeding": "https://proceedings.mlr.press/v162/zhang22d.html", "poster": "/media/PosterPDFs/ICML%202022/f4334c131c781e2a6f0a5e34814c8147.png?t=1657637211.7407992", "slides": "", "author_site": "Zhiyu Zhang, Ashok Cutkosky, Ioannis Paschalidis", "author": "Zhiyu Zhang; Ashok Cutkosky; Ioannis Paschalidis", "abstract": "Unconstrained Online Linear Optimization (OLO) is a practical problem setting to study the training of machine learning models. Existing works proposed a number of potential-based algorithms, but in general the design of these potential functions relies heavily on guessing. To streamline this workflow, we present a framework that generates new potential functions by solving a Partial Differential Equation (PDE). Specifically, when losses are 1-Lipschitz, our framework produces a novel algorithm with anytime regret bound $C\\sqrt{T}+||u||\\sqrt{2T}[\\sqrt{\\log(1+||u||/C)}+2]$, where $C$ is a user-specified constant and $u$ is any comparator unknown and unbounded a priori. Such a bound attains an optimal loss-regret trade-off without the impractical doubling trick. Moreover, a matching lower bound shows that the leading order term, including the constant multiplier $\\sqrt{2}$, is tight. 
To our knowledge, the proposed algorithm is the first to achieve such optimalities.", "bibtex": "@InProceedings{pmlr-v162-zhang22d,\n title = \t {{PDE}-Based Optimal Strategy for Unconstrained Online Learning},\n author = {Zhang, Zhiyu and Cutkosky, Ashok and Paschalidis, Ioannis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26085--26115},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22d/zhang22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22d.html},\n abstract = \t {Unconstrained Online Linear Optimization (OLO) is a practical problem setting to study the training of machine learning models. Existing works proposed a number of potential-based algorithms, but in general the design of these potential functions relies heavily on guessing. To streamline this workflow, we present a framework that generates new potential functions by solving a Partial Differential Equation (PDE). Specifically, when losses are 1-Lipschitz, our framework produces a novel algorithm with anytime regret bound $C\\sqrt{T}+||u||\\sqrt{2T}[\\sqrt{\\log(1+||u||/C)}+2]$, where $C$ is a user-specified constant and $u$ is any comparator unknown and unbounded a priori. Such a bound attains an optimal loss-regret trade-off without the impractical doubling trick. Moreover, a matching lower bound shows that the leading order term, including the constant multiplier $\\sqrt{2}$, is tight. To our knowledge, the proposed algorithm is the first to achieve such optimalities.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22d/zhang22d.pdf", "supp": "", "pdf_size": 1005088, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2664380085986514830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Boston University; Boston University; Boston University", "aff_domain": "bu.edu;cutkosky.com;bu.edu", "email": "bu.edu;cutkosky.com;bu.edu", "github": "", "project": "https://arxiv.org/abs/2201.07877", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "PDO-s3DCNNs: Partial Differential Operator Based Steerable 3D CNNs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18025", "id": "18025", "proceeding": "https://proceedings.mlr.press/v162/shen22c.html", "poster": "/media/PosterPDFs/ICML%202022/5ea1649a31336092c05438df996a3e59_K9pMgE9.png?t=1658070980.5377994", "slides": "", "author_site": "Zhengyang Shen, Tao Hong, Qi She, Jinwen Ma, Zhouchen Lin", "author": "Zhengyang Shen; Tao Hong; Qi She; Jinwen Ma; Zhouchen Lin", "abstract": "Steerable models can provide very general and flexible equivariance by formulating equivariance requirements in the language of representation theory and feature fields, which has been recognized to be effective for many vision tasks. 
However, deriving steerable models for 3D rotations is much more difficult than that in the 2D case, due to more complicated mathematics of 3D rotations. In this work, we employ partial differential operators (PDOs) to model 3D filters, and derive general steerable 3D CNNs, which are called PDO-s3DCNNs. We prove that the equivariant filters are subject to linear constraints, which can be solved efficiently under various conditions. As far as we know, PDO-s3DCNNs are the most general steerable CNNs for 3D rotations, in the sense that they cover all common subgroups of SO(3) and their representations, while existing methods can only be applied to specific groups and representations. Extensive experiments show that our models can preserve equivariance well in the discrete domain, and outperform previous works on SHREC\u201917 retrieval and ISBI 2012 segmentation tasks with a low network complexity.", "bibtex": "@InProceedings{pmlr-v162-shen22c,\n title = \t {{PDO}-s3{DCNN}s: Partial Differential Operator Based Steerable 3{D} {CNN}s},\n author = {Shen, Zhengyang and Hong, Tao and She, Qi and Ma, Jinwen and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19827--19846},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22c/shen22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22c.html},\n abstract = \t {Steerable models can provide very general and flexible equivariance by formulating equivariance requirements in the language of representation theory and feature fields, which has been recognized to be effective for many vision tasks. However, deriving steerable models for 3D rotations is much more difficult than that in the 2D case, due to more complicated mathematics of 3D rotations. In this work, we employ partial differential operators (PDOs) to model 3D filters, and derive general steerable 3D CNNs, which are called PDO-s3DCNNs. We prove that the equivariant filters are subject to linear constraints, which can be solved efficiently under various conditions. As far as we know, PDO-s3DCNNs are the most general steerable CNNs for 3D rotations, in the sense that they cover all common subgroups of SO(3) and their representations, while existing methods can only be applied to specific groups and representations. Extensive experiments show that our models can preserve equivariance well in the discrete domain, and outperform previous works on SHREC\u201917 retrieval and ISBI 2012 segmentation tasks with a low network complexity.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22c/shen22c.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/shen22c-supp.zip", "pdf_size": 646939, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7127988507569489900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Mathematical Sciences, Peking University, Beijing, China + Bytedance AI Lab, Haidian District, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China; Bytedance AI Lab, Haidian District, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China; Key Lab. 
of Machine Perception (MoE), School of Artificial Intelligence, Peking University, Beijing, China + Institute for Artificial Intelligence, Peking University, Beijing, China + Pazhou Lab, Guangzhou, China", "aff_domain": "math.pku.edu.cn; ; ;math.pku.edu.cn;pku.edu.cn", "email": "math.pku.edu.cn; ; ;math.pku.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/shen22c.html", "aff_unique_index": "0+1;0;1;0;0+0+2", "aff_unique_norm": "Peking University;ByteDance;Pazhou Lab", "aff_unique_dep": "School of Mathematical Sciences;AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;https://www.bytedance.com;", "aff_unique_abbr": "PKU;ByteDance;", "aff_campus_unique_index": "0+0;0;0;0;0+0+1", "aff_campus_unique": "Beijing;Guangzhou", "aff_country_unique_index": "0+0;0;0;0;0+0+0", "aff_country_unique": "China" }, { "title": "PINs: Progressive Implicit Networks for Multi-Scale Neural Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16197", "id": "16197", "proceeding": "https://proceedings.mlr.press/v162/landgraf22a.html", "poster": "/media/PosterPDFs/ICML%202022/aaebdb8bb6b0e73f6c3c54a0ab0c6415.png?t=1657578711.2578263", "slides": "", "author_site": "Zoe Landgraf, Alexander Sorkine Hornung, ricardo cabral", "author": "Zoe Landgraf; Alexander Sorkine Hornung; Ricardo S Cabral", "abstract": "Multi-layer perceptrons (MLP) have proven to be effective scene encoders when combined with higher-dimensional projections of the input, commonly referred to as positional encoding. However, scenes with a wide frequency spectrum remain a challenge: choosing high frequencies for positional encoding introduces noise in low structure areas, while low frequencies results in poor fitting of detailed regions. To address this, we propose a progressive positional encoding, exposing a hierarchical MLP structure to incremental sets of frequency encodings. Our model accurately reconstructs scenes with wide frequency bands and learns a scene representation at progressive level of detail without explicit per-level supervision. The architecture is modular: each level encodes a continuous implicit representation that can be leveraged separately for its respective resolution, meaning a smaller network for coarser reconstructions. Experiments on several 2D and 3D datasets shows improvements in reconstruction accuracy, representational capacity and training speed compared to baselines.", "bibtex": "@InProceedings{pmlr-v162-landgraf22a,\n title = \t {{PIN}s: Progressive Implicit Networks for Multi-Scale Neural Representations},\n author = {Landgraf, Zoe and Hornung, Alexander Sorkine and Cabral, Ricardo S},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11969--11984},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/landgraf22a/landgraf22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/landgraf22a.html},\n abstract = \t {Multi-layer perceptrons (MLP) have proven to be effective scene encoders when combined with higher-dimensional projections of the input, commonly referred to as positional encoding. 
However, scenes with a wide frequency spectrum remain a challenge: choosing high frequencies for positional encoding introduces noise in low structure areas, while low frequencies results in poor fitting of detailed regions. To address this, we propose a progressive positional encoding, exposing a hierarchical MLP structure to incremental sets of frequency encodings. Our model accurately reconstructs scenes with wide frequency bands and learns a scene representation at progressive level of detail without explicit per-level supervision. The architecture is modular: each level encodes a continuous implicit representation that can be leveraged separately for its respective resolution, meaning a smaller network for coarser reconstructions. Experiments on several 2D and 3D datasets shows improvements in reconstruction accuracy, representational capacity and training speed compared to baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/landgraf22a/landgraf22a.pdf", "supp": "", "pdf_size": 28413610, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5190792700645623212&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computing, Imperial College of London, London, UK + Meta, Zuerich, Switzerland; Meta, Zuerich, Switzerland; Meta, Zuerich, Switzerland", "aff_domain": "fb.com; ; ", "email": "fb.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/landgraf22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Imperial College of London;Meta", "aff_unique_dep": "Department of Computing;Meta Platforms, Inc.", "aff_unique_url": "https://www.imperial.ac.uk;https://www.meta.com", "aff_unique_abbr": "Imperial College;Meta", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "London;Zuerich", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "United Kingdom;Switzerland" }, { "title": "PLATINUM: Semi-Supervised Model Agnostic Meta-Learning using Submodular Mutual Information", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18231", "id": "18231", "proceeding": "https://proceedings.mlr.press/v162/li22k.html", "poster": "/media/PosterPDFs/ICML%202022/c96e651946818e0787d6296f69549fe1.png?t=1657496249.461876", "slides": "/media/icml-2022/Slides/18231.pdf", "author_site": "Changbin Li, Suraj Kothawade, Feng Chen, Rishabh Iyer", "author": "Changbin Li; Suraj Kothawade; Feng Chen; Rishabh Iyer", "abstract": "Few-shot classification (FSC) requires training models using a few (typically one to five) data points per class. Meta-learning has proven to be able to learn a parametrized model for FSC by training on various other classification tasks. In this work, we propose PLATINUM (semi-suPervised modeL Agnostic meTa learnIng usiNg sUbmodular Mutual information), a novel semi-supervised model agnostic meta learning framework that uses the submodular mutual information (SMI) functions to boost the performance of FSC. PLATINUM leverages unlabeled data in the inner and outer loop using SMI functions during meta-training and obtains richer meta-learned parameterizations. We study the performance of PLATINUM in two scenarios - 1) where the unlabeled data points belong to the same set of classes as the labeled set of a certain episode, and 2) where there exist out-of-distribution classes that do not belong to the labeled set. We evaluate our method on various settings on the miniImageNet, tieredImageNet and CIFAR-FS datasets. 
Our experiments show that PLATINUM outperforms MAML and semi-supervised approaches like pseudo-labeling for semi-supervised FSC, especially for small ratio of labeled to unlabeled samples.", "bibtex": "@InProceedings{pmlr-v162-li22k,\n title = \t {{PLATINUM}: Semi-Supervised Model Agnostic Meta-Learning using Submodular Mutual Information},\n author = {Li, Changbin and Kothawade, Suraj and Chen, Feng and Iyer, Rishabh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12826--12842},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22k/li22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22k.html},\n abstract = \t {Few-shot classification (FSC) requires training models using a few (typically one to five) data points per class. Meta-learning has proven to be able to learn a parametrized model for FSC by training on various other classification tasks. In this work, we propose PLATINUM (semi-suPervised modeL Agnostic meTa learnIng usiNg sUbmodular Mutual information), a novel semi-supervised model agnostic meta learning framework that uses the submodular mutual information (SMI) functions to boost the performance of FSC. PLATINUM leverages unlabeled data in the inner and outer loop using SMI functions during meta-training and obtains richer meta-learned parameterizations. We study the performance of PLATINUM in two scenarios - 1) where the unlabeled data points belong to the same set of classes as the labeled set of a certain episode, and 2) where there exist out-of-distribution classes that do not belong to the labeled set. We evaluate our method on various settings on the miniImageNet, tieredImageNet and CIFAR-FS datasets. 
Our experiments show that PLATINUM outperforms MAML and semi-supervised approaches like pseudo-labeling for semi-supervised FSC, especially for small ratio of labeled to unlabeled samples.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22k/li22k.pdf", "supp": "", "pdf_size": 656564, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1070646536780297100&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "University of Texas at Dallas; University of Texas at Dallas; University of Texas at Dallas; University of Texas at Dallas", "aff_domain": "utdallas.edu;utdallas.edu; ; ", "email": "utdallas.edu;utdallas.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/li22k.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Dallas", "aff_unique_dep": "", "aff_unique_url": "https://www.utdallas.edu", "aff_unique_abbr": "UT Dallas", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Dallas", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "PLATON: Pruning Large Transformer Models with Upper Confidence Bound of Weight Importance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17017", "id": "17017", "proceeding": "https://proceedings.mlr.press/v162/zhang22ao.html", "poster": "/media/PosterPDFs/ICML%202022/4ebd440d99504722d80de606ea8507da.png?t=1658107210.091692", "slides": "", "author_site": "Qingru Zhang, Simiao Zuo, Chen Liang, Alexander Bukharin, Pengcheng He, Weizhu Chen, Tuo Zhao", "author": "Qingru Zhang; Simiao Zuo; Chen Liang; Alexander Bukharin; Pengcheng He; Weizhu Chen; Tuo Zhao", "abstract": "Large Transformer-based models have exhibited superior performance in various natural language processing and computer vision tasks. However, these models contain enormous amounts of parameters, which restrict their deployment to real-world applications. To reduce the model size, researchers prune these models based on the weights\u2019 importance scores. However, such scores are usually estimated on mini-batches during training, which incurs large variability/uncertainty due to mini-batch sampling and complicated training dynamics. As a result, some crucial weights could be pruned by commonly used pruning methods because of such uncertainty, which makes training unstable and hurts generalization. To resolve this issue, we propose PLATON, which captures the uncertainty of importance scores by upper confidence bound of importance estimation. In particular, for the weights with low importance scores but high uncertainty, PLATON tends to retain them and explores their capacity. We conduct extensive experiments with several Transformer-based models on natural language understanding, question answering and image classification to validate the effectiveness of PLATON. Results demonstrate that PLATON manifests notable improvement under different sparsity levels. 
Our code is publicly available at https://github.com/QingruZhang/PLATON.", "bibtex": "@InProceedings{pmlr-v162-zhang22ao,\n title = \t {{PLATON}: Pruning Large Transformer Models with Upper Confidence Bound of Weight Importance},\n author = {Zhang, Qingru and Zuo, Simiao and Liang, Chen and Bukharin, Alexander and He, Pengcheng and Chen, Weizhu and Zhao, Tuo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26809--26823},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ao/zhang22ao.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ao.html},\n abstract = \t {Large Transformer-based models have exhibited superior performance in various natural language processing and computer vision tasks. However, these models contain enormous amounts of parameters, which restrict their deployment to real-world applications. To reduce the model size, researchers prune these models based on the weights\u2019 importance scores. However, such scores are usually estimated on mini-batches during training, which incurs large variability/uncertainty due to mini-batch sampling and complicated training dynamics. As a result, some crucial weights could be pruned by commonly used pruning methods because of such uncertainty, which makes training unstable and hurts generalization. To resolve this issue, we propose PLATON, which captures the uncertainty of importance scores by upper confidence bound of importance estimation. In particular, for the weights with low importance scores but high uncertainty, PLATON tends to retain them and explores their capacity. We conduct extensive experiments with several Transformer-based models on natural language understanding, question answering and image classification to validate the effectiveness of PLATON. Results demonstrate that PLATON manifests notable improvement under different sparsity levels. 
Our code is publicly available at https://github.com/QingruZhang/PLATON.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ao/zhang22ao.pdf", "supp": "", "pdf_size": 814448, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17654209064614422018&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Georgia Institute of Technology; Georgia Institute of Technology; Georgia Institute of Technology; Georgia Institute of Technology; Microsoft Azure AI; Microsoft Azure AI; Georgia Institute of Technology", "aff_domain": "gatech.edu; ; ; ; ; ;gatech.edu", "email": "gatech.edu; ; ; ; ; ;gatech.edu", "github": "https://github.com/QingruZhang/PLATON", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhang22ao.html", "aff_unique_index": "0;0;0;0;1;1;0", "aff_unique_norm": "Georgia Institute of Technology;Microsoft", "aff_unique_dep": ";Azure AI", "aff_unique_url": "https://www.gatech.edu;https://azure.microsoft.com/en-us/ai", "aff_unique_abbr": "Georgia Tech;Microsoft Azure AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "PMIC: Improving Multi-Agent Reinforcement Learning with Progressive Mutual Information Collaboration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18275", "id": "18275", "proceeding": "https://proceedings.mlr.press/v162/li22s.html", "poster": "/media/PosterPDFs/ICML%202022/2f2b265625d76a6704b08093c652fd79.png?t=1657179239.5021393", "slides": "", "author_site": "Pengyi Li, Hongyao Tang, Tianpei Yang, Xiaotian Hao, Tong Sang, Yan Zheng, Jianye Hao, Matthew Taylor, Wenyuan Tao, Zhen Wang", "author": "Pengyi Li; Hongyao Tang; Tianpei Yang; Xiaotian Hao; Tong Sang; Yan Zheng; Jianye Hao; Matthew E. Taylor; Wenyuan Tao; Zhen Wang", "abstract": "Learning to collaborate is critical in Multi-Agent Reinforcement Learning (MARL). Previous works promote collaboration by maximizing the correlation of agents\u2019 behaviors, which is typically characterized by Mutual Information (MI) in different forms. However, we reveal sub-optimal collaborative behaviors also emerge with strong correlations, and simply maximizing the MI can, surprisingly, hinder the learning towards better collaboration. To address this issue, we propose a novel MARL framework, called Progressive Mutual Information Collaboration (PMIC), for more effective MI-driven collaboration. PMIC uses a new collaboration criterion measured by the MI between global states and joint actions. Based on this criterion, the key idea of PMIC is maximizing the MI associated with superior collaborative behaviors and minimizing the MI associated with inferior ones. The two MI objectives play complementary roles by facilitating better collaborations while avoiding falling into sub-optimal ones. Experiments on a wide range of MARL benchmarks show the superior performance of PMIC compared with other algorithms.", "bibtex": "@InProceedings{pmlr-v162-li22s,\n title = \t {{PMIC}: Improving Multi-Agent Reinforcement Learning with Progressive Mutual Information Collaboration},\n author = {Li, Pengyi and Tang, Hongyao and Yang, Tianpei and Hao, Xiaotian and Sang, Tong and Zheng, Yan and Hao, Jianye and Taylor, Matthew E. 
and Tao, Wenyuan and Wang, Zhen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12979--12997},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22s/li22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22s.html},\n abstract = \t {Learning to collaborate is critical in Multi-Agent Reinforcement Learning (MARL). Previous works promote collaboration by maximizing the correlation of agents\u2019 behaviors, which is typically characterized by Mutual Information (MI) in different forms. However, we reveal sub-optimal collaborative behaviors also emerge with strong correlations, and simply maximizing the MI can, surprisingly, hinder the learning towards better collaboration. To address this issue, we propose a novel MARL framework, called Progressive Mutual Information Collaboration (PMIC), for more effective MI-driven collaboration. PMIC uses a new collaboration criterion measured by the MI between global states and joint actions. Based on this criterion, the key idea of PMIC is maximizing the MI associated with superior collaborative behaviors and minimizing the MI associated with inferior ones. The two MI objectives play complementary roles by facilitating better collaborations while avoiding falling into sub-optimal ones. Experiments on a wide range of MARL benchmarks show the superior performance of PMIC compared with other algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22s/li22s.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/li22s-supp.zip", "pdf_size": 21587427, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2755470732694105502&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "College of Intelligence and Computing, Tianjin University, China; College of Intelligence and Computing, Tianjin University, China; College of Intelligence and Computing, Tianjin University, China+University of Alberta, Canada; College of Intelligence and Computing, Tianjin University, China; College of Intelligence and Computing, Tianjin University, China; College of Intelligence and Computing, Tianjin University, China; College of Intelligence and Computing, Tianjin University, China; University of Alberta, Canada; College of Intelligence and Computing, Tianjin University, China; Northwestern Polytechnical University, China", "aff_domain": "tju.edu.cn;tju.edu.cn; ; ; ; ; ; ; ; ", "email": "tju.edu.cn;tju.edu.cn; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/li22s.html", "aff_unique_index": "0;0;0+1;0;0;0;0;1;0;2", "aff_unique_norm": "Tianjin University;University of Alberta;Northwestern Polytechnical University", "aff_unique_dep": "College of Intelligence and Computing;;", "aff_unique_url": "http://www.tju.edu.cn;https://www.ualberta.ca;http://www.nwpu.edu.cn", "aff_unique_abbr": "Tianjin University;UAlberta;NWPU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0;0;0;0;1;0;0", "aff_country_unique": "China;Canada" }, { "title": "POEM: Out-of-Distribution Detection with Posterior Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16651", "id": "16651", 
"proceeding": "https://proceedings.mlr.press/v162/ming22a.html", "poster": "/media/PosterPDFs/ICML%202022/13111c20aee51aeb480ecbd988cd8cc9.png?t=1658106229.5977175", "slides": "", "author_site": "Yifei Ming, Ying Fan, Yixuan Li", "author": "Yifei Ming; Ying Fan; Yixuan Li", "abstract": "Out-of-distribution (OOD) detection is indispensable for machine learning models deployed in the open world. Recently, the use of an auxiliary outlier dataset during training (also known as outlier exposure) has shown promising performance. As the sample space for potential OOD data can be prohibitively large, sampling informative outliers is essential. In this work, we propose a novel posterior sampling based outlier mining framework, POEM, which facilitates efficient use of outlier data and promotes learning a compact decision boundary between ID and OOD data for improved detection. We show that POEM establishes state-of-the-art performance on common benchmarks. Compared to the current best method that uses a greedy sampling strategy, POEM improves the relative performance by 42.0% and 24.2% (FPR95) on CIFAR-10 and CIFAR-100, respectively. We further provide theoretical insights on the effectiveness of POEM for OOD detection.", "bibtex": "@InProceedings{pmlr-v162-ming22a,\n title = \t {{POEM}: Out-of-Distribution Detection with Posterior Sampling},\n author = {Ming, Yifei and Fan, Ying and Li, Yixuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15650--15665},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ming22a/ming22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ming22a.html},\n abstract = \t {Out-of-distribution (OOD) detection is indispensable for machine learning models deployed in the open world. Recently, the use of an auxiliary outlier dataset during training (also known as outlier exposure) has shown promising performance. As the sample space for potential OOD data can be prohibitively large, sampling informative outliers is essential. In this work, we propose a novel posterior sampling based outlier mining framework, POEM, which facilitates efficient use of outlier data and promotes learning a compact decision boundary between ID and OOD data for improved detection. We show that POEM establishes state-of-the-art performance on common benchmarks. Compared to the current best method that uses a greedy sampling strategy, POEM improves the relative performance by 42.0% and 24.2% (FPR95) on CIFAR-10 and CIFAR-100, respectively. 
We further provide theoretical insights on the effectiveness of POEM for OOD detection.}\n}", "pdf": "https://proceedings.mlr.press/v162/ming22a/ming22a.pdf", "supp": "", "pdf_size": 2038298, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14373980882186283690&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Department of Computer Sciences, University of Wisconsin-Madison, USA; Department of Computer Sciences, University of Wisconsin-Madison, USA; Department of Computer Sciences, University of Wisconsin-Madison, USA", "aff_domain": "wisc.edu;wisc.edu;cs.wisc.edu", "email": "wisc.edu;wisc.edu;cs.wisc.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ming22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "POET: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18171", "id": "18171", "proceeding": "https://proceedings.mlr.press/v162/patil22b.html", "poster": "/media/PosterPDFs/ICML%202022/7bb16972da003e87724f048d76b7e0e1.png?t=1658122541.8960636", "slides": "", "author_site": "Shishir G. Patil, Paras Jain, Prabal Dutta, Ion Stoica, Joseph E Gonzalez", "author": "Shishir G. Patil; Paras Jain; Prabal Dutta; Ion Stoica; Joseph Gonzalez", "abstract": "Fine-tuning models on edge devices like mobile phones would enable privacy-preserving personalization over sensitive data. However, edge training has historically been limited to relatively small models with simple architectures because training is both memory and energy intensive. We present POET, an algorithm to enable training large neural networks on memory-scarce battery-operated edge devices. POET jointly optimizes the integrated search spaces of rematerialization and paging, two algorithms to reduce the memory consumption of backpropagation. Given a memory budget and a run-time constraint, we formulate a mixed-integer linear program (MILP) for energy-optimal training. Our approach enables training significantly larger models on embedded devices while reducing energy consumption, without modifying the mathematical correctness of backpropagation. We demonstrate that it is possible to fine-tune both ResNet-18 and BERT within the memory constraints of a Cortex-M class embedded device while outperforming current edge training methods in energy efficiency. POET is an open-source project available at https://github.com/ShishirPatil/poet", "bibtex": "@InProceedings{pmlr-v162-patil22b,\n title = \t {{POET}: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging},\n author = {Patil, Shishir G. 
and Jain, Paras and Dutta, Prabal and Stoica, Ion and Gonzalez, Joseph},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17573--17583},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/patil22b/patil22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/patil22b.html},\n abstract = \t {Fine-tuning models on edge devices like mobile phones would enable privacy-preserving personalization over sensitive data. However, edge training has historically been limited to relatively small models with simple architectures because training is both memory and energy intensive. We present POET, an algorithm to enable training large neural networks on memory-scarce battery-operated edge devices. POET jointly optimizes the integrated search spaces of rematerialization and paging, two algorithms to reduce the memory consumption of backpropagation. Given a memory budget and a run-time constraint, we formulate a mixed-integer linear program (MILP) for energy-optimal training. Our approach enables training significantly larger models on embedded devices while reducing energy consumption, without modifying the mathematical correctness of backpropagation. We demonstrate that it is possible to fine-tune both ResNet-18 and BERT within the memory constraints of a Cortex-M class embedded device while outperforming current edge training methods in energy efficiency. POET is an open-source project available at https://github.com/ShishirPatil/poet}\n}", "pdf": "https://proceedings.mlr.press/v162/patil22b/patil22b.pdf", "supp": "", "pdf_size": 523207, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5184430437455623817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "University of California Berkeley; University of California Berkeley; University of California Berkeley; University of California Berkeley; University of California Berkeley", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "https://github.com/ShishirPatil/poet", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/patil22b.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Pairwise Conditional Gradients without Swap Steps and Sparser Kernel Herding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17205", "id": "17205", "proceeding": "https://proceedings.mlr.press/v162/tsuji22a.html", "poster": "/media/PosterPDFs/ICML%202022/9ec51f6eb240fb631a35864e13737bca_hyU4YPW.png?t=1657347706.7713916", "slides": "/media/icml-2022/Slides/17205.pdf", "author_site": "Kazuma Tsuji, Ken'ichiro Tanaka, Sebastian Pokutta", "author": "Kazuma K Tsuji; Ken\u2019Ichiro Tanaka; Sebastian Pokutta", "abstract": "The Pairwise Conditional Gradients (PCG) algorithm is a powerful extension of the Frank-Wolfe algorithm leading to particularly sparse solutions, which makes PCG very appealing for problems such as 
sparse signal recovery, sparse regression, and kernel herding. Unfortunately, PCG exhibits so-called swap steps that might not provide sufficient primal progress. The number of these bad steps is bounded by a function in the dimension and as such known guarantees do not generalize to the infinite-dimensional case, which would be needed for kernel herding. We propose a new variant of PCG, the so-called Blended Pairwise Conditional Gradients (BPCG). This new algorithm does not exhibit any swap steps, is very easy to implement, and does not require any internal gradient alignment procedures. The convergence rate of BPCG is basically that of PCG if no drop steps would occur and as such is no worse than PCG but improves and provides new rates in many cases. Moreover, we observe in the numerical experiments that BPCG\u2019s solutions are much sparser than those of PCG. We apply BPCG to the kernel herding setting, where we derive nice quadrature rules and provide numerical results demonstrating the performance of our method.", "bibtex": "@InProceedings{pmlr-v162-tsuji22a,\n title = \t {Pairwise Conditional Gradients without Swap Steps and Sparser Kernel Herding},\n author = {Tsuji, Kazuma K and Tanaka, Ken'Ichiro and Pokutta, Sebastian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21864--21883},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tsuji22a/tsuji22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tsuji22a.html},\n abstract = \t {The Pairwise Conditional Gradients (PCG) algorithm is a powerful extension of the Frank-Wolfe algorithm leading to particularly sparse solutions, which makes PCG very appealing for problems such as sparse signal recovery, sparse regression, and kernel herding. Unfortunately, PCG exhibits so-called swap steps that might not provide sufficient primal progress. The number of these bad steps is bounded by a function in the dimension and as such known guarantees do not generalize to the infinite-dimensional case, which would be needed for kernel herding. We propose a new variant of PCG, the so-called Blended Pairwise Conditional Gradients (BPCG). This new algorithm does not exhibit any swap steps, is very easy to implement, and does not require any internal gradient alignment procedures. The convergence rate of BPCG is basically that of PCG if no drop steps would occur and as such is no worse than PCG but improves and provides new rates in many cases. Moreover, we observe in the numerical experiments that BPCG\u2019s solutions are much sparser than those of PCG. 
We apply BPCG to the kernel herding setting, where we derive nice quadrature rules and provide numerical results demonstrating the performance of our method.}\n}", "pdf": "https://proceedings.mlr.press/v162/tsuji22a/tsuji22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/tsuji22a-supp.zip", "pdf_size": 987928, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8707081130927738276&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff": "MUFG Bank, Ltd., Tokyo, Japan; Graduate School of Information Science and Technology, The University of Tokyo, Tokyo, Japan + PRESTO Japan Science and Technological Agency (JST), Tokyo, Japan; AISST, Zuse Institute Berlin and Institute of Mathematics, Technische Universit\u00e4t Berlin, Berlin, Germany", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/tsuji22a.html", "aff_unique_index": "0;1+2;3", "aff_unique_norm": "MUFG Bank, Ltd.;University of Tokyo;Japan Science and Technological Agency;Technische Universit\u00e4t Berlin", "aff_unique_dep": ";Graduate School of Information Science and Technology;PRESTO;Institute of Mathematics", "aff_unique_url": "https://www.mufg.jp;https://www.u-tokyo.ac.jp;https://www.jst.go.jp;https://www.tu-berlin.de", "aff_unique_abbr": "MUFG;UTokyo;JST;TU Berlin", "aff_campus_unique_index": "0;0+0;1", "aff_campus_unique": "Tokyo;Berlin", "aff_country_unique_index": "0;0+0;1", "aff_country_unique": "Japan;Germany" }, { "title": "Parametric Visual Program Induction with Function Modularization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17855", "id": "17855", "proceeding": "https://proceedings.mlr.press/v162/duan22c.html", "poster": "/media/PosterPDFs/ICML%202022/112e9d677c7483747f216a1470bed734.png?t=1657633960.4467146", "slides": "/media/icml-2022/Slides/17855.pdf", "author_site": "Xuguang Duan, Xin Wang, Ziwei Zhang, Wenwu Zhu", "author": "Xuguang Duan; Xin Wang; Ziwei Zhang; Wenwu Zhu", "abstract": "Generating programs to describe visual observations has gained much research attention recently. However, most of the existing approaches are based on non-parametric primitive functions, making them unable to handle complex visual scenes involving many attributes and details. In this paper, we propose the concept of parametric visual program induction. Learning to generate parametric programs for visual scenes is challenging due to the huge number of function variants and the complex function correlations. To solve these challenges, we propose the method of function modularization, capable of dealing with numerous function variants and complex correlations. Specifically, we model each parametric function as a multi-head self-contained neural module to cover different function variants. Moreover, to eliminate the complex correlations between functions, we propose the hierarchical heterogeneous Monto-Carlo tree search (H2MCTS) algorithm which can provide high-quality uncorrelated supervision during training, and serve as an efficient searching technique during testing. We demonstrate the superiority of the proposed method on three visual program induction datasets involving parametric primitive functions. 
Experimental results show that our proposed model is able to significantly outperform the state-of-the-art baseline methods in terms of generating accurate programs.", "bibtex": "@InProceedings{pmlr-v162-duan22c,\n title = \t {Parametric Visual Program Induction with Function Modularization},\n author = {Duan, Xuguang and Wang, Xin and Zhang, Ziwei and Zhu, Wenwu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5643--5658},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/duan22c/duan22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/duan22c.html},\n abstract = \t {Generating programs to describe visual observations has gained much research attention recently. However, most of the existing approaches are based on non-parametric primitive functions, making them unable to handle complex visual scenes involving many attributes and details. In this paper, we propose the concept of parametric visual program induction. Learning to generate parametric programs for visual scenes is challenging due to the huge number of function variants and the complex function correlations. To solve these challenges, we propose the method of function modularization, capable of dealing with numerous function variants and complex correlations. Specifically, we model each parametric function as a multi-head self-contained neural module to cover different function variants. Moreover, to eliminate the complex correlations between functions, we propose the hierarchical heterogeneous Monte-Carlo tree search (H2MCTS) algorithm which can provide high-quality uncorrelated supervision during training, and serve as an efficient searching technique during testing. We demonstrate the superiority of the proposed method on three visual program induction datasets involving parametric primitive functions. 
Experimental results show that our proposed model is able to significantly outperform the state-of-the-art baseline methods in terms of generating accurate programs.}\n}", "pdf": "https://proceedings.mlr.press/v162/duan22c/duan22c.pdf", "supp": "", "pdf_size": 3049756, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2141545493093602309&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Tsinghua University, Beijing, China", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn; ; ", "email": "tsinghua.edu.cn;tsinghua.edu.cn; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/duan22c.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Parsimonious Learning-Augmented Caching", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16573", "id": "16573", "proceeding": "https://proceedings.mlr.press/v162/im22a.html", "poster": "/media/PosterPDFs/ICML%202022/39ae2ed11b14a4ccb41d35e9d1ba5d11.png?t=1657510113.7448173", "slides": "", "author_site": "Sungjin Im, Ravi Kumar, Aditya Petety, Manish Purohit", "author": "Sungjin Im; Ravi Kumar; Aditya Petety; Manish Purohit", "abstract": "Learning-augmented algorithms\u2014in which, traditional algorithms are augmented with machine-learned predictions\u2014have emerged as a framework to go beyond worst-case analysis. The overarching goal is to design algorithms that perform near-optimally when the predictions are accurate yet retain certain worst-case guarantees irrespective of the accuracy of the predictions. This framework has been successfully applied to online problems such as caching where the predictions can be used to alleviate uncertainties. In this paper we introduce and study the setting in which the learning-augmented algorithm can utilize the predictions parsimoniously. 
We consider the caching problem\u2014which has been extensively studied in the learning-augmented setting\u2014and show that one can achieve quantitatively similar results but only using a", "bibtex": "@InProceedings{pmlr-v162-im22a,\n title = \t {Parsimonious Learning-Augmented Caching},\n author = {Im, Sungjin and Kumar, Ravi and Petety, Aditya and Purohit, Manish},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9588--9601},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/im22a/im22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/im22a.html},\n abstract = \t {Learning-augmented algorithms\u2014in which, traditional algorithms are augmented with machine-learned predictions\u2014have emerged as a framework to go beyond worst-case analysis. The overarching goal is to design algorithms that perform near-optimally when the predictions are accurate yet retain certain worst-case guarantees irrespective of the accuracy of the predictions. This framework has been successfully applied to online problems such as caching where the predictions can be used to alleviate uncertainties. In this paper we introduce and study the setting in which the learning-augmented algorithm can utilize the predictions parsimoniously. We consider the caching problem\u2014which has been extensively studied in the learning-augmented setting\u2014and show that one can achieve quantitatively similar results but only using a", "pdf": "https://proceedings.mlr.press/v162/im22a/im22a.pdf", "supp": "", "pdf_size": 350891, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8942619275233260721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "UC Merced; Google Mountain View; UC Merced; Google Mountain View", "aff_domain": "gmail.com;gmail.com; ;google.com", "email": "gmail.com;gmail.com; ;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/im22a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of California, Merced;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.ucmerced.edu;https://www.google.com", "aff_unique_abbr": "UCM;Google", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Merced;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Partial Counterfactual Identification from Observational and Experimental Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17031", "id": "17031", "proceeding": "https://proceedings.mlr.press/v162/zhang22ab.html", "poster": "/media/PosterPDFs/ICML%202022/178b0113689dce8a7e48360c3886dc99.png?t=1658116229.54417", "slides": "/media/icml-2022/Slides/17031_cj3wvZS.pdf", "author_site": "Junzhe Zhang, Jin Tian, Elias Bareinboim", "author": "Junzhe Zhang; Jin Tian; Elias Bareinboim", "abstract": "This paper investigates the problem of bounding counterfactual queries from an arbitrary collection of observational and experimental distributions and qualitative knowledge about the underlying data-generating model represented in the form of a causal diagram. 
We show that all counterfactual distributions in an arbitrary structural causal model (SCM) with discrete observed domains could be generated by a canonical family of SCMs with the same causal diagram where unobserved (exogenous) variables are also discrete, taking values in finite domains. Utilizing the canonical SCMs, we translate the problem of bounding counterfactuals into that of polynomial programming whose solution provides optimal bounds for the counterfactual query. Solving such polynomial programs is in general computationally expensive. We then develop effective Monte Carlo algorithms to approximate optimal bounds from a combination of observational and experimental data. Our algorithms are validated extensively on synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-zhang22ab,\n title = \t {Partial Counterfactual Identification from Observational and Experimental Data},\n author = {Zhang, Junzhe and Tian, Jin and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26548--26558},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ab/zhang22ab.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ab.html},\n abstract = \t {This paper investigates the problem of bounding counterfactual queries from an arbitrary collection of observational and experimental distributions and qualitative knowledge about the underlying data-generating model represented in the form of a causal diagram. We show that all counterfactual distributions in an arbitrary structural causal model (SCM) with discrete observed domains could be generated by a canonical family of SCMs with the same causal diagram where unobserved (exogenous) variables are also discrete, taking values in finite domains. Utilizing the canonical SCMs, we translate the problem of bounding counterfactuals into that of polynomial programming whose solution provides optimal bounds for the counterfactual query. Solving such polynomial programs is in general computationally expensive. We then develop effective Monte Carlo algorithms to approximate optimal bounds from a combination of observational and experimental data. 
Our algorithms are validated extensively on synthetic and real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ab/zhang22ab.pdf", "supp": "", "pdf_size": 482184, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8308988910579940487&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Columbia University; Department of Computer Science, Iowa State University; Department of Computer Science, Columbia University", "aff_domain": "cs.columbia.edu; ; ", "email": "cs.columbia.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22ab.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Columbia University;Iowa State University", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.columbia.edu;https://www.iastate.edu", "aff_unique_abbr": "Columbia;ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Partial Label Learning via Label Influence Function", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15975", "id": "15975", "proceeding": "https://proceedings.mlr.press/v162/gong22c.html", "poster": "/media/PosterPDFs/ICML%202022/b710915795b9e9c02cf10d6d2bdb688c.png?t=1657189194.1690152", "slides": "", "author_site": "Xiuwen Gong, Dong Yuan, Wei Bao", "author": "Xiuwen Gong; Dong Yuan; Wei Bao", "abstract": "To deal with ambiguities in partial label learning (PLL), state-of-the-art strategies implement disambiguations by identifying the ground-truth label directly from the candidate label set. However, these approaches usually take the label that incurs a minimal loss as the ground-truth label or use the weight to represent which label has a high likelihood to be the ground-truth label. Little work has been done to investigate from the perspective of how a candidate label changing a predictive model. In this paper, inspired by influence function, we develop a novel PLL framework called Partial Label Learning via Label Influence Function (PLL-IF). Moreover, we implement the framework with two specific representative models, an SVM model and a neural network model, which are called PLL-IF+SVM and PLL-IF+NN method respectively. Extensive experiments conducted on various datasets demonstrate the superiorities of the proposed methods in terms of prediction accuracy, which in turn validates the effectiveness of the proposed PLL-IF framework.", "bibtex": "@InProceedings{pmlr-v162-gong22c,\n title = \t {Partial Label Learning via Label Influence Function},\n author = {Gong, Xiuwen and Yuan, Dong and Bao, Wei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7665--7678},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gong22c/gong22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/gong22c.html},\n abstract = \t {To deal with ambiguities in partial label learning (PLL), state-of-the-art strategies implement disambiguations by identifying the ground-truth label directly from the candidate label set. 
However, these approaches usually take the label that incurs a minimal loss as the ground-truth label or use the weight to represent which label has a high likelihood to be the ground-truth label. Little work has been done to investigate from the perspective of how a candidate label changing a predictive model. In this paper, inspired by influence function, we develop a novel PLL framework called Partial Label Learning via Label Influence Function (PLL-IF). Moreover, we implement the framework with two specific representative models, an SVM model and a neural network model, which are called PLL-IF+SVM and PLL-IF+NN method respectively. Extensive experiments conducted on various datasets demonstrate the superiorities of the proposed methods in terms of prediction accuracy, which in turn validates the effectiveness of the proposed PLL-IF framework.}\n}", "pdf": "https://proceedings.mlr.press/v162/gong22c/gong22c.pdf", "supp": "", "pdf_size": 2493521, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16884636424486316158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Faculty of Engineering, The University of Sydney, NSW, Australia; Faculty of Engineering, The University of Sydney, NSW, Australia; Faculty of Engineering, The University of Sydney, NSW, Australia", "aff_domain": "sydney.edu.au; ; ", "email": "sydney.edu.au; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gong22c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Sydney", "aff_unique_dep": "Faculty of Engineering", "aff_unique_url": "https://www.sydney.edu.au", "aff_unique_abbr": "USYD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Sydney", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "Partial and Asymmetric Contrastive Learning for Out-of-Distribution Detection in Long-Tailed Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16859", "id": "16859", "proceeding": "https://proceedings.mlr.press/v162/wang22aq.html", "poster": "", "slides": "", "author_site": "Haotao Wang, Aston Zhang, Yi Zhu, Shuai Zheng, Mu Li, Alex Smola, Zhangyang \u201cAtlas\u201d Wang", "author": "Haotao Wang; Aston Zhang; Yi Zhu; Shuai Zheng; Mu Li; Alex J Smola; Zhangyang Wang", "abstract": "Existing out-of-distribution (OOD) detection methods are typically benchmarked on training sets with balanced class distributions. However, in real-world applications, it is common for the training sets to have long-tailed distributions. In this work, we first demonstrate that existing OOD detection methods commonly suffer from significant performance degradation when the training set is long-tail distributed. Through analysis, we posit that this is because the models struggle to distinguish the minority tail-class in-distribution samples, from the true OOD samples, making the tail classes more prone to be falsely detected as OOD. To solve this problem, we propose Partial and Asymmetric Supervised Contrastive Learning (PASCL), which explicitly encourages the model to distinguish between tail-class in-distribution samples and OOD samples. To further boost in-distribution classification accuracy, we propose Auxiliary Branch Finetuning, which uses two separate branches of BN and classification layers for anomaly detection and in-distribution classification, respectively. The intuition is that in-distribution and OOD anomaly data have different underlying distributions. 
Our method outperforms previous state-of-the-art method by $1.29%$, $1.45%$, $0.69%$ anomaly detection false positive rate (FPR) and $3.24%$, $4.06%$, $7.89%$ in-distribution classification accuracy on CIFAR10-LT, CIFAR100-LT, and ImageNet-LT, respectively. Code and pre-trained models are available at https://github.com/amazon-research/long-tailed-ood-detection.", "bibtex": "@InProceedings{pmlr-v162-wang22aq,\n title = \t {Partial and Asymmetric Contrastive Learning for Out-of-Distribution Detection in Long-Tailed Recognition},\n author = {Wang, Haotao and Zhang, Aston and Zhu, Yi and Zheng, Shuai and Li, Mu and Smola, Alex J and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23446--23458},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22aq/wang22aq.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22aq.html},\n abstract = \t {Existing out-of-distribution (OOD) detection methods are typically benchmarked on training sets with balanced class distributions. However, in real-world applications, it is common for the training sets to have long-tailed distributions. In this work, we first demonstrate that existing OOD detection methods commonly suffer from significant performance degradation when the training set is long-tail distributed. Through analysis, we posit that this is because the models struggle to distinguish the minority tail-class in-distribution samples, from the true OOD samples, making the tail classes more prone to be falsely detected as OOD. To solve this problem, we propose Partial and Asymmetric Supervised Contrastive Learning (PASCL), which explicitly encourages the model to distinguish between tail-class in-distribution samples and OOD samples. To further boost in-distribution classification accuracy, we propose Auxiliary Branch Finetuning, which uses two separate branches of BN and classification layers for anomaly detection and in-distribution classification, respectively. The intuition is that in-distribution and OOD anomaly data have different underlying distributions. Our method outperforms previous state-of-the-art method by $1.29%$, $1.45%$, $0.69%$ anomaly detection false positive rate (FPR) and $3.24%$, $4.06%$, $7.89%$ in-distribution classification accuracy on CIFAR10-LT, CIFAR100-LT, and ImageNet-LT, respectively. 
Code and pre-trained models are available at https://github.com/amazon-research/long-tailed-ood-detection.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22aq/wang22aq.pdf", "supp": "", "pdf_size": 632722, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14212057730611759763&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Texas at Austin; Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services; University of Texas at Austin", "aff_domain": "utexas.edu;amazon.com; ; ; ; ;utexas.edu", "email": "utexas.edu;amazon.com; ; ; ; ;utexas.edu", "github": "https://github.com/amazon-research/long-tailed-ood-detection", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/wang22aq.html", "aff_unique_index": "0;1;1;1;1;1;0", "aff_unique_norm": "University of Texas at Austin;Amazon", "aff_unique_dep": ";Amazon Web Services", "aff_unique_url": "https://www.utexas.edu;https://aws.amazon.com", "aff_unique_abbr": "UT Austin;AWS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Partial disentanglement for domain adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18105", "id": "18105", "proceeding": "https://proceedings.mlr.press/v162/kong22a.html", "poster": "", "slides": "", "author_site": "Lingjing Kong, Shaoan Xie, Weiran Yao, Yujia Zheng, Guangyi Chen, Petar Stojanov, Victor Akinwande, Kun Zhang", "author": "Lingjing Kong; Shaoan Xie; Weiran Yao; Yujia Zheng; Guangyi Chen; Petar Stojanov; Victor Akinwande; Kun Zhang", "abstract": "Unsupervised domain adaptation is critical to many real-world applications where label information is unavailable in the target domain. In general, without further assumptions, the joint distribution of the features and the label is not identifiable in the target domain. To address this issue, we rely on a property of minimal changes of causal mechanisms across domains to minimize unnecessary influences of domain shift. To encode this property, we first formulate the data generating process using a latent variable model with two partitioned latent subspaces: invariant components whose distributions stay the same across domains, and sparse changing components that vary across domains. We further constrain the domain shift to have a restrictive influence on the changing components. Under mild conditions, we show that the latent variables are partially identifiable, from which it follows that the joint distribution of data and labels in the target domain is also identifiable. Given the theoretical insights, we propose a practical domain adaptation framework, called iMSDA. 
Extensive experimental results reveal that iMSDA outperforms state-of-the-art domain adaptation algorithms on benchmark datasets, demonstrating the effectiveness of our framework.", "bibtex": "@InProceedings{pmlr-v162-kong22a,\n title = \t {Partial disentanglement for domain adaptation},\n author = {Kong, Lingjing and Xie, Shaoan and Yao, Weiran and Zheng, Yujia and Chen, Guangyi and Stojanov, Petar and Akinwande, Victor and Zhang, Kun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11455--11472},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kong22a/kong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kong22a.html},\n abstract = \t {Unsupervised domain adaptation is critical to many real-world applications where label information is unavailable in the target domain. In general, without further assumptions, the joint distribution of the features and the label is not identifiable in the target domain. To address this issue, we rely on a property of minimal changes of causal mechanisms across domains to minimize unnecessary influences of domain shift. To encode this property, we first formulate the data generating process using a latent variable model with two partitioned latent subspaces: invariant components whose distributions stay the same across domains, and sparse changing components that vary across domains. We further constrain the domain shift to have a restrictive influence on the changing components. Under mild conditions, we show that the latent variables are partially identifiable, from which it follows that the joint distribution of data and labels in the target domain is also identifiable. Given the theoretical insights, we propose a practical domain adaptation framework, called iMSDA. 
Extensive experimental results reveal that iMSDA outperforms state-of-the-art domain adaptation algorithms on benchmark datasets, demonstrating the effectiveness of our framework.}\n}", "pdf": "https://proceedings.mlr.press/v162/kong22a/kong22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kong22a-supp.zip", "pdf_size": 1570652, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3964810722904249092&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Carnegie Mellon University, USA; Carnegie Mellon University, USA; Carnegie Mellon University, USA; Carnegie Mellon University, USA; Mohamed bin Zayed University of Artificial Intelligence, UAE; Broad Institute of MIT and Harvard, USA; Carnegie Mellon University, USA; Carnegie Mellon University, USA + Mohamed bin Zayed University of Artificial Intelligence, UAE", "aff_domain": "andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;mbzuai.ac.ae;broadinstitute.org;andrew.cmu.edu;cmu.edu", "email": "andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;mbzuai.ac.ae;broadinstitute.org;andrew.cmu.edu;cmu.edu", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/kong22a.html", "aff_unique_index": "0;0;0;0;1;2;0;0+1", "aff_unique_norm": "Carnegie Mellon University;Mohamed bin Zayed University of Artificial Intelligence;Broad Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://mbzuai.ac.ae;https://www.broadinstitute.org", "aff_unique_abbr": "CMU;MBZUAI;Broad", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0;0;0+1", "aff_country_unique": "United States;United Arab Emirates" }, { "title": "Particle Transformer for Jet Tagging", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17989", "id": "17989", "proceeding": "https://proceedings.mlr.press/v162/qu22b.html", "poster": "/media/PosterPDFs/ICML%202022/aff82e881075d9c1ec306f86ae15c833_PgVDVJs.png?t=1657712767.906831", "slides": "", "author_site": "Huilin Qu, Congqiao Li, Sitian Qian", "author": "Huilin Qu; Congqiao Li; Sitian Qian", "abstract": "Jet tagging is a critical yet challenging classification task in particle physics. While deep learning has transformed jet tagging and significantly improved performance, the lack of a large-scale public dataset impedes further enhancement. In this work, we present JetClass, a new comprehensive dataset for jet tagging. The JetClass dataset consists of 100 M jets, about two orders of magnitude larger than existing public datasets. A total of 10 types of jets are simulated, including several types unexplored for tagging so far. Based on the large dataset, we propose a new Transformer-based architecture for jet tagging, called Particle Transformer (ParT). By incorporating pairwise particle interactions in the attention mechanism, ParT achieves higher tagging performance than a plain Transformer and surpasses the previous state-of-the-art, ParticleNet, by a large margin. The pre-trained ParT models, once fine-tuned, also substantially enhance the performance on two widely adopted jet tagging benchmarks. 
The dataset, code and models are publicly available at https://github.com/jet-universe/particle_transformer.", "bibtex": "@InProceedings{pmlr-v162-qu22b,\n title = \t {Particle Transformer for Jet Tagging},\n author = {Qu, Huilin and Li, Congqiao and Qian, Sitian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18281--18292},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qu22b/qu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/qu22b.html},\n abstract = \t {Jet tagging is a critical yet challenging classification task in particle physics. While deep learning has transformed jet tagging and significantly improved performance, the lack of a large-scale public dataset impedes further enhancement. In this work, we present JetClass, a new comprehensive dataset for jet tagging. The JetClass dataset consists of 100 M jets, about two orders of magnitude larger than existing public datasets. A total of 10 types of jets are simulated, including several types unexplored for tagging so far. Based on the large dataset, we propose a new Transformer-based architecture for jet tagging, called Particle Transformer (ParT). By incorporating pairwise particle interactions in the attention mechanism, ParT achieves higher tagging performance than a plain Transformer and surpasses the previous state-of-the-art, ParticleNet, by a large margin. The pre-trained ParT models, once fine-tuned, also substantially enhance the performance on two widely adopted jet tagging benchmarks. 
The dataset, code and models are publicly available at https://github.com/jet-universe/particle_transformer.}\n}", "pdf": "https://proceedings.mlr.press/v162/qu22b/qu22b.pdf", "supp": "", "pdf_size": 2173412, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12329206017907212560&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "CERN, Geneva, Switzerland; School of Physics, Peking University, Beijing, China; School of Physics, Peking University, Beijing, China", "aff_domain": "cern.ch;pku.edu.cn;pku.edu.cn", "email": "cern.ch;pku.edu.cn;pku.edu.cn", "github": "https://github.com/jet-universe/particle_transformer", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/qu22b.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "CERN;Peking University", "aff_unique_dep": ";School of Physics", "aff_unique_url": "https://home.cern;http://www.pku.edu.cn", "aff_unique_abbr": "CERN;Peking U", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Geneva;Beijing", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Switzerland;China" }, { "title": "Path-Aware and Structure-Preserving Generation of Synthetically Accessible Molecules", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18315", "id": "18315", "proceeding": "https://proceedings.mlr.press/v162/noh22a.html", "poster": "/media/PosterPDFs/ICML%202022/e034fb6b66aacc1d48f445ddfb08da98_fnoL5qd.png?t=1657169717.893629", "slides": "/media/icml-2022/Slides/18315.pdf", "author_site": "Juhwan Noh, Dae-Woong Jeong, Kiyoung Kim, Sehui Han, Moontae Lee, Honglak Lee, Yousung Jung", "author": "Juhwan Noh; Dae-Woong Jeong; Kiyoung Kim; Sehui Han; Moontae Lee; Honglak Lee; Yousung Jung", "abstract": "Computational chemistry aims to autonomously design specific molecules with target functionality. Generative frameworks provide useful tools to learn continuous representations of molecules in a latent space. While modelers could optimize chemical properties, many generated molecules are not synthesizable. To design synthetically accessible molecules that preserve main structural motifs of target molecules, we propose a reaction-embedded and structure-conditioned variational autoencoder. As the latent space jointly encodes molecular structures and their reaction routes, our new sampling method that measures the path-informed structural similarity allows us to effectively generate structurally analogous synthesizable molecules. When targeting out-of-domain as well as in-domain seed structures, our model generates structurally and property-wisely similar molecules equipped with well-defined reaction paths. 
By focusing on the important region in chemical space, we also demonstrate that our model can design new molecules with even higher activity than the seed molecules.", "bibtex": "@InProceedings{pmlr-v162-noh22a,\n title = \t {Path-Aware and Structure-Preserving Generation of Synthetically Accessible Molecules},\n author = {Noh, Juhwan and Jeong, Dae-Woong and Kim, Kiyoung and Han, Sehui and Lee, Moontae and Lee, Honglak and Jung, Yousung},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16952--16968},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/noh22a/noh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/noh22a.html},\n abstract = \t {Computational chemistry aims to autonomously design specific molecules with target functionality. Generative frameworks provide useful tools to learn continuous representations of molecules in a latent space. While modelers could optimize chemical properties, many generated molecules are not synthesizable. To design synthetically accessible molecules that preserve main structural motifs of target molecules, we propose a reaction-embedded and structure-conditioned variational autoencoder. As the latent space jointly encodes molecular structures and their reaction routes, our new sampling method that measures the path-informed structural similarity allows us to effectively generate structurally analogous synthesizable molecules. When targeting out-of-domain as well as in-domain seed structures, our model generates structurally and property-wisely similar molecules equipped with well-defined reaction paths. 
By focusing on the important region in chemical space, we also demonstrate that our model can design new molecules with even higher activity than the seed molecules.}\n}", "pdf": "https://proceedings.mlr.press/v162/noh22a/noh22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/noh22a-supp.zip", "pdf_size": 3589736, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2827893446322140993&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Chemical and Biomolecular Engineering (BK21 four), Korea Advanced Institute of Science and Technology, Republic of Korea; LG AI Research, Republic of Korea; LG AI Research, Republic of Korea; LG AI Research, Republic of Korea; LG AI Research, Republic of Korea + Department of Information and Decision Sciences, University of Illinois Chicago, USA; LG AI Research, Republic of Korea; Graduate School of Artificial Intelligence, Korea Advanced Institute of Science and Technology, Republic of Korea", "aff_domain": "kaist.ac.kr; ; ; ; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ; ; ; ;kaist.ac.kr", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/noh22a.html", "aff_unique_index": "0;1;1;1;1+2;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;LG;University of Illinois Chicago", "aff_unique_dep": "Department of Chemical and Biomolecular Engineering;LG AI Research;Department of Information and Decision Sciences", "aff_unique_url": "https://www.kaist.ac.kr;https://www.lgaires.com;https://www.uic.edu", "aff_unique_abbr": "KAIST;LG AI;UIC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;0;0;0;0+1;0;0", "aff_country_unique": "South Korea;United States" }, { "title": "Path-Gradient Estimators for Continuous Normalizing Flows", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17303", "id": "17303", "proceeding": "https://proceedings.mlr.press/v162/vaitl22a.html", "poster": "/media/PosterPDFs/ICML%202022/7417744a2bac776fabe5a09b21c707a2.png?t=1657808940.4898367", "slides": "", "author_site": "Lorenz Vaitl, Kim A. Nicoli, Shinichi Nakajima, Pan Kessel", "author": "Lorenz Vaitl; Kim Andrea Nicoli; Shinichi Nakajima; Pan Kessel", "abstract": "Recent work has established a path-gradient estimator for simple variational Gaussian distributions and has argued that the path-gradient is particularly beneficial in the regime in which the variational distribution approaches the exact target distribution. In many applications, this regime can however not be reached by a simple Gaussian variational distribution. In this work, we overcome this crucial limitation by proposing a path-gradient estimator for the considerably more expressive variational family of continuous normalizing flows. 
We outline an efficient algorithm to calculate this estimator and establish its superior performance empirically.", "bibtex": "@InProceedings{pmlr-v162-vaitl22a,\n title = \t {Path-Gradient Estimators for Continuous Normalizing Flows},\n author = {Vaitl, Lorenz and Nicoli, Kim Andrea and Nakajima, Shinichi and Kessel, Pan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21945--21959},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vaitl22a/vaitl22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vaitl22a.html},\n abstract = \t {Recent work has established a path-gradient estimator for simple variational Gaussian distributions and has argued that the path-gradient is particularly beneficial in the regime in which the variational distribution approaches the exact target distribution. In many applications, this regime can however not be reached by a simple Gaussian variational distribution. In this work, we overcome this crucial limitation by proposing a path-gradient estimator for the considerably more expressive variational family of continuous normalizing flows. We outline an efficient algorithm to calculate this estimator and establish its superior performance empirically.}\n}", "pdf": "https://proceedings.mlr.press/v162/vaitl22a/vaitl22a.pdf", "supp": "", "pdf_size": 606121, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=102102598474391702&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Machine Learning Group, Department of Electrical Engineering & Computer Science, Technische Universit\u00e4t Berlin, Germany+BIFOLD - Berlin Institute for the Foundations of Learning and Data, Technische Universit\u00e4t Berlin, Berlin, Germany; Machine Learning Group, Department of Electrical Engineering & Computer Science, Technische Universit\u00e4t Berlin, Germany+BIFOLD - Berlin Institute for the Foundations of Learning and Data, Technische Universit\u00e4t Berlin, Berlin, Germany+RIKEN Center for AIP, 103-0027 Tokyo, Chuo City, Japan; Machine Learning Group, Department of Electrical Engineering & Computer Science, Technische Universit\u00e4t Berlin, Germany+BIFOLD - Berlin Institute for the Foundations of Learning and Data, Technische Universit\u00e4t Berlin, Berlin, Germany; Machine Learning Group, Department of Electrical Engineering & Computer Science, Technische Universit\u00e4t Berlin, Germany+BIFOLD - Berlin Institute for the Foundations of Learning and Data, Technische Universit\u00e4t Berlin, Berlin, Germany", "aff_domain": "tu-berlin.de;tu-berlin.de;riken.jp;tu-berlin.de", "email": "tu-berlin.de;tu-berlin.de;riken.jp;tu-berlin.de", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/vaitl22a.html", "aff_unique_index": "0+0;0+0+1;0+0;0+0", "aff_unique_norm": "Technische Universit\u00e4t Berlin;RIKEN Center for AIP", "aff_unique_dep": "Department of Electrical Engineering & Computer Science;Center for AIP", "aff_unique_url": "https://www.tu-berlin.de;https://www.riken.jp/en/", "aff_unique_abbr": "TU Berlin;RIKEN AIP", "aff_campus_unique_index": "0+0;0+0+1;0+0;0+0", "aff_campus_unique": "Berlin;Tokyo", "aff_country_unique_index": "0+0;0+0+1;0+0;0+0", "aff_country_unique": 
"Germany;Japan" }, { "title": "Penalizing Gradient Norm for Efficiently Improving Generalization in Deep Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18305", "id": "18305", "proceeding": "https://proceedings.mlr.press/v162/zhao22i.html", "poster": "/media/PosterPDFs/ICML%202022/5c7a3b81a677c639c76989610183c0e0.png?t=1656601254.4360032", "slides": "", "author_site": "Yang Zhao, Hao Zhang, Xiuyuan Hu", "author": "Yang Zhao; Hao Zhang; Xiuyuan Hu", "abstract": "How to train deep neural networks (DNNs) to generalize well is a central concern in deep learning, especially for severely overparameterized networks nowadays. In this paper, we propose an effective method to improve the model generalization by additionally penalizing the gradient norm of loss function during optimization. We demonstrate that confining the gradient norm of loss function could help lead the optimizers towards finding flat minima. We leverage the first-order approximation to efficiently implement the corresponding gradient to fit well in the gradient descent framework. In our experiments, we confirm that when using our methods, generalization performance of various models could be improved on different datasets. Also, we show that the recent sharpness-aware minimization method (Foretet al., 2021) is a special, but not the best, case of our method, where the best case of our method could give new state-of-art performance on these tasks. Code is available at https://github.com/zhaoyang-0204/gnp.", "bibtex": "@InProceedings{pmlr-v162-zhao22i,\n title = \t {Penalizing Gradient Norm for Efficiently Improving Generalization in Deep Learning},\n author = {Zhao, Yang and Zhang, Hao and Hu, Xiuyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26982--26992},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22i/zhao22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22i.html},\n abstract = \t {How to train deep neural networks (DNNs) to generalize well is a central concern in deep learning, especially for severely overparameterized networks nowadays. In this paper, we propose an effective method to improve the model generalization by additionally penalizing the gradient norm of loss function during optimization. We demonstrate that confining the gradient norm of loss function could help lead the optimizers towards finding flat minima. We leverage the first-order approximation to efficiently implement the corresponding gradient to fit well in the gradient descent framework. In our experiments, we confirm that when using our methods, generalization performance of various models could be improved on different datasets. Also, we show that the recent sharpness-aware minimization method (Foretet al., 2021) is a special, but not the best, case of our method, where the best case of our method could give new state-of-art performance on these tasks. 
Code is available at https://github.com/zhaoyang-0204/gnp.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22i/zhao22i.pdf", "supp": "", "pdf_size": 549140, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9350049289748522587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electronic Engineering, Tsinghua University; Department of Electronic Engineering, Tsinghua University; Department of Electronic Engineering, Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn; ", "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn; ", "github": "https://github.com/zhaoyang-0204/gnp", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhao22i.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Electronic Engineering", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Perfectly Balanced: Improving Transfer and Robustness of Supervised Contrastive Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17875", "id": "17875", "proceeding": "https://proceedings.mlr.press/v162/chen22d.html", "poster": "/media/PosterPDFs/ICML%202022/7d91786c01b3931e8d94baf248608979_xn7JGqo.png?t=1657744144.5413778", "slides": "", "author_site": "Mayee Chen, Daniel Y Fu, Avanika Narayan, Michael Zhang, Zhao Song, Kayvon Fatahalian, Christopher Re", "author": "Mayee Chen; Daniel Y Fu; Avanika Narayan; Michael Zhang; Zhao Song; Kayvon Fatahalian; Christopher Re", "abstract": "An ideal learned representation should display transferability and robustness. Supervised contrastive learning (SupCon) is a promising method for training accurate models, but produces representations that do not capture these properties due to class collapse\u2014when all points in a class map to the same representation. Recent work suggests that \"spreading out\" these representations improves them, but the precise mechanism is poorly understood. We argue that creating spread alone is insufficient for better representations, since spread is invariant to permutations within classes. Instead, both the correct degree of spread and a mechanism for breaking this invariance are necessary. We first prove that adding a weighted class-conditional InfoNCE loss to SupCon controls the degree of spread. Next, we study three mechanisms to break permutation invariance: using a constrained encoder, adding a class-conditional autoencoder, and using data augmentation. We show that the latter two encourage clustering of latent subclasses under more realistic conditions than the former. 
Using these insights, we show that adding a properly-weighted class-conditional InfoNCE loss and a class-conditional autoencoder to SupCon achieves 11.1 points of lift on coarse-to-fine transfer across 5 standard datasets and 4.7 points on worst-group robustness on 3 datasets, setting state-of-the-art on CelebA by 11.5 points.", "bibtex": "@InProceedings{pmlr-v162-chen22d,\n title = \t {Perfectly Balanced: Improving Transfer and Robustness of Supervised Contrastive Learning},\n author = {Chen, Mayee and Fu, Daniel Y and Narayan, Avanika and Zhang, Michael and Song, Zhao and Fatahalian, Kayvon and Re, Christopher},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3090--3122},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22d/chen22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22d.html},\n abstract = \t {An ideal learned representation should display transferability and robustness. Supervised contrastive learning (SupCon) is a promising method for training accurate models, but produces representations that do not capture these properties due to class collapse\u2014when all points in a class map to the same representation. Recent work suggests that \"spreading out\" these representations improves them, but the precise mechanism is poorly understood. We argue that creating spread alone is insufficient for better representations, since spread is invariant to permutations within classes. Instead, both the correct degree of spread and a mechanism for breaking this invariance are necessary. We first prove that adding a weighted class-conditional InfoNCE loss to SupCon controls the degree of spread. Next, we study three mechanisms to break permutation invariance: using a constrained encoder, adding a class-conditional autoencoder, and using data augmentation. We show that the latter two encourage clustering of latent subclasses under more realistic conditions than the former. 
Using these insights, we show that adding a properly-weighted class-conditional InfoNCE loss and a class-conditional autoencoder to SupCon achieves 11.1 points of lift on coarse-to-fine transfer across 5 standard datasets and 4.7 points on worst-group robustness on 3 datasets, setting state-of-the-art on CelebA by 11.5 points.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22d/chen22d.pdf", "supp": "", "pdf_size": 2396489, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4069781946979626386&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Adobe Research; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu;cs.stanford.edu; ; ; ; ; ", "email": "cs.stanford.edu;cs.stanford.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/chen22d.html", "aff_unique_index": "0;0;0;0;1;0;0", "aff_unique_norm": "Stanford University;Adobe", "aff_unique_dep": "Department of Computer Science;Adobe Research", "aff_unique_url": "https://www.stanford.edu;https://research.adobe.com", "aff_unique_abbr": "Stanford;Adobe", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Permutation Search of Tensor Network Structures via Local Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17821", "id": "17821", "proceeding": "https://proceedings.mlr.press/v162/li22y.html", "poster": "", "slides": "", "author_site": "Chao Li, Junhua Zeng, Zerui Tao, Qibin Zhao", "author": "Chao Li; Junhua Zeng; Zerui Tao; Qibin Zhao", "abstract": "Recent works put much effort into", "bibtex": "@InProceedings{pmlr-v162-li22y,\n title = \t {Permutation Search of Tensor Network Structures via Local Sampling},\n author = {Li, Chao and Zeng, Junhua and Tao, Zerui and Zhao, Qibin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13106--13124},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22y/li22y.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22y.html},\n abstract = \t {Recent works put much effort into", "pdf": "https://proceedings.mlr.press/v162/li22y/li22y.pdf", "supp": "", "pdf_size": 4063042, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14266729648210963776&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "RIKEN Center for Advanced Intelligence Project (RIKEN-AIP), Tokyo, Japan+Tokyo University of Agriculture and Technology, Tokyo, Japan; School of Automation, Guangdong University of Technology, Guangzhou, China; Tokyo University of Agriculture and Technology, Tokyo, Japan; RIKEN Center for Advanced Intelligence Project (RIKEN-AIP), Tokyo, Japan", "aff_domain": "riken.jp;gdut.edu.cn;tuat.ac.jp;riken.jp", "email": "riken.jp;gdut.edu.cn;tuat.ac.jp;riken.jp", "github": "", "project": "", "author_num": 4, 
"oa": "https://proceedings.mlr.press/v162/li22y.html", "aff_unique_index": "0+1;2;1;0", "aff_unique_norm": "RIKEN Center for Advanced Intelligence Project;Tokyo University of Agriculture and Technology;Guangdong University of Technology", "aff_unique_dep": "Advanced Intelligence Project;;School of Automation", "aff_unique_url": "https://www.riken.jp/en/research/labs/aip/;https://www.tuat.ac.jp;", "aff_unique_abbr": "RIKEN-AIP;TUAT;", "aff_campus_unique_index": "0+0;1;0;0", "aff_campus_unique": "Tokyo;Guangzhou", "aff_country_unique_index": "0+0;1;0;0", "aff_country_unique": "Japan;China" }, { "title": "Personalization Improves Privacy-Accuracy Tradeoffs in Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16591", "id": "16591", "proceeding": "https://proceedings.mlr.press/v162/bietti22a.html", "poster": "", "slides": "", "author_site": "Alberto Bietti, Chen-Yu Wei, Miroslav Dudik, John Langford, Steven Wu", "author": "Alberto Bietti; Chen-Yu Wei; Miroslav Dudik; John Langford; Steven Wu", "abstract": "Large-scale machine learning systems often involve data distributed across a collection of users. Federated learning algorithms leverage this structure by communicating model updates to a central server, rather than entire datasets. In this paper, we study stochastic optimization algorithms for a personalized federated learning setting involving local and global models subject to user-level (joint) differential privacy. While learning a private global model induces a cost of privacy, local learning is perfectly private. We provide generalization guarantees showing that coordinating local learning with private centralized learning yields a generically useful and improved tradeoff between accuracy and privacy. We illustrate our theoretical results with experiments on synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-bietti22a,\n title = \t {Personalization Improves Privacy-Accuracy Tradeoffs in Federated Learning},\n author = {Bietti, Alberto and Wei, Chen-Yu and Dudik, Miroslav and Langford, John and Wu, Steven},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1945--1962},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bietti22a/bietti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bietti22a.html},\n abstract = \t {Large-scale machine learning systems often involve data distributed across a collection of users. Federated learning algorithms leverage this structure by communicating model updates to a central server, rather than entire datasets. In this paper, we study stochastic optimization algorithms for a personalized federated learning setting involving local and global models subject to user-level (joint) differential privacy. While learning a private global model induces a cost of privacy, local learning is perfectly private. We provide generalization guarantees showing that coordinating local learning with private centralized learning yields a generically useful and improved tradeoff between accuracy and privacy. 
We illustrate our theoretical results with experiments on synthetic and real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/bietti22a/bietti22a.pdf", "supp": "", "pdf_size": 625864, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2803924388956334708&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Center for Data Science, New York University; University of Southern California; Microsoft Research, New York; Microsoft Research, New York; Carnegie Mellon University", "aff_domain": "nyu.edu; ; ; ; ", "email": "nyu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/bietti22a.html", "aff_unique_index": "0;1;2;2;3", "aff_unique_norm": "New York University;University of Southern California;Microsoft;Carnegie Mellon University", "aff_unique_dep": "Center for Data Science;;Microsoft Research;", "aff_unique_url": "https://www.nyu.edu;https://www.usc.edu;https://www.microsoft.com/en-us/research;https://www.cmu.edu", "aff_unique_abbr": "NYU;USC;MSR;CMU", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "New York;Los Angeles;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Personalized Federated Learning through Local Memorization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18221", "id": "18221", "proceeding": "https://proceedings.mlr.press/v162/marfoq22a.html", "poster": "/media/PosterPDFs/ICML%202022/95a7e4252fc7bc562a711ef96884a383.png?t=1657181650.6410675", "slides": "", "author_site": "Othmane Marfoq, Giovanni Neglia, Richard Vidal, Laetitia Kameni", "author": "Othmane Marfoq; Giovanni Neglia; Richard Vidal; Laetitia Kameni", "abstract": "Federated learning allows clients to collaboratively learn statistical models while keeping their data local. Federated learning was originally used to train a unique global model to be served to all clients, but this approach might be sub-optimal when clients\u2019 local data distributions are heterogeneous. In order to tackle this limitation, recent personalized federated learning methods train a separate model for each client while still leveraging the knowledge available at other clients. In this work, we exploit the ability of deep neural networks to extract high quality vectorial representations (embeddings) from non-tabular data, e.g., images and text, to propose a personalization mechanism based on local memorization. Personalization is obtained by interpolating a collectively trained global model with a local $k$-nearest neighbors (kNN) model based on the shared representation provided by the global model. 
We provide generalization bounds for the proposed approach in the case of binary classification, and we show on a suite of federated datasets that this approach achieves significantly higher accuracy and fairness than state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-marfoq22a,\n title = \t {Personalized Federated Learning through Local Memorization},\n author = {Marfoq, Othmane and Neglia, Giovanni and Vidal, Richard and Kameni, Laetitia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15070--15092},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/marfoq22a/marfoq22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/marfoq22a.html},\n abstract = \t {Federated learning allows clients to collaboratively learn statistical models while keeping their data local. Federated learning was originally used to train a unique global model to be served to all clients, but this approach might be sub-optimal when clients\u2019 local data distributions are heterogeneous. In order to tackle this limitation, recent personalized federated learning methods train a separate model for each client while still leveraging the knowledge available at other clients. In this work, we exploit the ability of deep neural networks to extract high quality vectorial representations (embeddings) from non-tabular data, e.g., images and text, to propose a personalization mechanism based on local memorization. Personalization is obtained by interpolating a collectively trained global model with a local $k$-nearest neighbors (kNN) model based on the shared representation provided by the global model. 
We provide generalization bounds for the proposed approach in the case of binary classification, and we show on a suite of federated datasets that this approach achieves significantly higher accuracy and fairness than state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/marfoq22a/marfoq22a.pdf", "supp": "", "pdf_size": 944928, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1735959565667819081&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Inria, Universit\u00e9 C\u00f4te d\u2019Azur, Sophia Antipolis, France; Inria, Universit\u00e9 C\u00f4te d\u2019Azur, Sophia Antipolis, France; Accenture Labs, Sophia Antipolis, France; Accenture Labs, Sophia Antipolis, France", "aff_domain": "inria.fr; ; ; ", "email": "inria.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/marfoq22a.html", "aff_unique_index": "0;0;1;1", "aff_unique_norm": "INRIA;Accenture Labs", "aff_unique_dep": ";", "aff_unique_url": "https://www.inria.fr;https://labs.accenture.com", "aff_unique_abbr": "Inria;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Sophia Antipolis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Personalized Federated Learning via Variational Bayesian Inference", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17301", "id": "17301", "proceeding": "https://proceedings.mlr.press/v162/zhang22o.html", "poster": "/media/PosterPDFs/ICML%202022/46ba9f2a6976570b0353203ec4474217_ko8BnIx.png?t=1656416930.1195948", "slides": "", "author_site": "Xu Zhang, Yinchuan Li, Wenpeng Li, Kaiyang Guo, Yunfeng Shao", "author": "Xu Zhang; Yinchuan Li; Wenpeng Li; Kaiyang Guo; Yunfeng Shao", "abstract": "Federated learning faces huge challenges from model overfitting due to the lack of data and statistical diversity among clients. To address these challenges, this paper proposes a novel personalized federated learning method via Bayesian variational inference named pFedBayes. To alleviate the overfitting, weight uncertainty is introduced to neural networks for clients and the server. To achieve personalization, each client updates its local distribution parameters by balancing its construction error over private data and its KL divergence with global distribution from the server. Theoretical analysis gives an upper bound of averaged generalization error and illustrates that the convergence rate of the generalization error is minimax optimal up to a logarithmic factor. Experiments show that the proposed method outperforms other advanced personalized methods on personalized models, e.g., pFedBayes respectively outperforms other SOTA algorithms by 1.25%, 0.42% and 11.71% on MNIST, FMNIST and CIFAR-10 under non-i.i.d. 
limited data.", "bibtex": "@InProceedings{pmlr-v162-zhang22o,\n title = \t {Personalized Federated Learning via Variational {B}ayesian Inference},\n author = {Zhang, Xu and Li, Yinchuan and Li, Wenpeng and Guo, Kaiyang and Shao, Yunfeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26293--26310},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22o/zhang22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22o.html},\n abstract = \t {Federated learning faces huge challenges from model overfitting due to the lack of data and statistical diversity among clients. To address these challenges, this paper proposes a novel personalized federated learning method via Bayesian variational inference named pFedBayes. To alleviate the overfitting, weight uncertainty is introduced to neural networks for clients and the server. To achieve personalization, each client updates its local distribution parameters by balancing its construction error over private data and its KL divergence with global distribution from the server. Theoretical analysis gives an upper bound of averaged generalization error and illustrates that the convergence rate of the generalization error is minimax optimal up to a logarithmic factor. Experiments show that the proposed method outperforms other advanced personalized methods on personalized models, e.g., pFedBayes respectively outperforms other SOTA algorithms by 1.25%, 0.42% and 11.71% on MNIST, FMNIST and CIFAR-10 under non-i.i.d. 
limited data.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22o/zhang22o.pdf", "supp": "", "pdf_size": 745207, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6250118205299636766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "LSEC, Academy of Mathematics and Systems Science, Chinese Academy of Sciences, Beijing, China; Noah\u2019s Ark Lab, Huawei, Beijing, China; Noah\u2019s Ark Lab, Huawei, Beijing, China; Noah\u2019s Ark Lab, Huawei, Beijing, China; Noah\u2019s Ark Lab, Huawei, Beijing, China", "aff_domain": "huawei.com; ; ; ; ", "email": "huawei.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22o.html", "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Chinese Academy of Sciences;Huawei", "aff_unique_dep": "Academy of Mathematics and Systems Science;Noah\u2019s Ark Lab", "aff_unique_url": "http://www.cas.cn;https://www.huawei.com", "aff_unique_abbr": "CAS;Huawei", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Pessimism meets VCG: Learning Dynamic Mechanism Design via Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17343", "id": "17343", "proceeding": "https://proceedings.mlr.press/v162/lyu22b.html", "poster": "/media/PosterPDFs/ICML%202022/edea298442a67de045e88dfb6e5ea4a2_tKWpmdN.png?t=1657740022.1960776", "slides": "", "author_site": "Boxiang Lyu, Zhaoran Wang, Mladen Kolar, Zhuoran Yang", "author": "Boxiang Lyu; Zhaoran Wang; Mladen Kolar; Zhuoran Yang", "abstract": "Dynamic mechanism design has garnered significant attention from both computer scientists and economists in recent years. By allowing agents to interact with the seller over multiple rounds, where agents\u2019 reward functions may change with time and are state-dependent, the framework is able to model a rich class of real-world problems. In these works, the interaction between agents and sellers is often assumed to follow a Markov Decision Process (MDP). We focus on the setting where the reward and transition functions of such an MDP are not known a priori, and we are attempting to recover the optimal mechanism using an a priori collected data set. In the setting where the function approximation is employed to handle large state spaces, with only mild assumptions on the expressiveness of the function class, we are able to design a dynamic mechanism using offline reinforcement learning algorithms. Moreover, learned mechanisms approximately have three key desiderata: efficiency, individual rationality, and truthfulness. Our algorithm is based on the pessimism principle and only requires a mild assumption on the coverage of the offline data set. 
To the best of our knowledge, our work provides the first offline RL algorithm for dynamic mechanism design without assuming uniform coverage.", "bibtex": "@InProceedings{pmlr-v162-lyu22b,\n title = \t {Pessimism meets {VCG}: Learning Dynamic Mechanism Design via Offline Reinforcement Learning},\n author = {Lyu, Boxiang and Wang, Zhaoran and Kolar, Mladen and Yang, Zhuoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14601--14638},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lyu22b/lyu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lyu22b.html},\n abstract = \t {Dynamic mechanism design has garnered significant attention from both computer scientists and economists in recent years. By allowing agents to interact with the seller over multiple rounds, where agents\u2019 reward functions may change with time and are state-dependent, the framework is able to model a rich class of real-world problems. In these works, the interaction between agents and sellers is often assumed to follow a Markov Decision Process (MDP). We focus on the setting where the reward and transition functions of such an MDP are not known a priori, and we are attempting to recover the optimal mechanism using an a priori collected data set. In the setting where the function approximation is employed to handle large state spaces, with only mild assumptions on the expressiveness of the function class, we are able to design a dynamic mechanism using offline reinforcement learning algorithms. Moreover, learned mechanisms approximately have three key desiderata: efficiency, individual rationality, and truthfulness. Our algorithm is based on the pessimism principle and only requires a mild assumption on the coverage of the offline data set. 
To the best of our knowledge, our work provides the first offline RL algorithm for dynamic mechanism design without assuming uniform coverage.}\n}", "pdf": "https://proceedings.mlr.press/v162/lyu22b/lyu22b.pdf", "supp": "", "pdf_size": 462419, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16860994539686110838&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Booth School of Business, University of Chicago, Chicago, IL, USA+Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, IL, USA; Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, IL, USA; Booth School of Business, University of Chicago, Chicago, IL, USA; Department of Operations Research and Financial Engineering, Princeton University, Princeton, NJ, USA", "aff_domain": "chicagobooth.edu; ; ; ", "email": "chicagobooth.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lyu22b.html", "aff_unique_index": "0+1;1;0;2", "aff_unique_norm": "University of Chicago;Northwestern University;Princeton University", "aff_unique_dep": "Booth School of Business;Department of Industrial Engineering and Management Sciences;Department of Operations Research and Financial Engineering", "aff_unique_url": "https://www.chicagobooth.edu;https://www.northwestern.edu;https://www.princeton.edu", "aff_unique_abbr": "UChicago;NU;Princeton", "aff_campus_unique_index": "0+1;1;0;2", "aff_campus_unique": "Chicago;Evanston;Princeton", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Pessimistic Minimax Value Iteration: Provably Efficient Equilibrium Learning from Offline Datasets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17657", "id": "17657", "proceeding": "https://proceedings.mlr.press/v162/zhong22b.html", "poster": "/media/PosterPDFs/ICML%202022/ac2460b56866901d732f996b82b69d31_nFVh2xQ.png?t=1656559359.4662337", "slides": "", "author_site": "Han Zhong, Wei Xiong, Jiyuan Tan, Liwei Wang, Tong Zhang, Zhaoran Wang, Zhuoran Yang", "author": "Han Zhong; Wei Xiong; Jiyuan Tan; Liwei Wang; Tong Zhang; Zhaoran Wang; Zhuoran Yang", "abstract": "We study episodic two-player zero-sum Markov games (MGs) in the offline setting, where the goal is to find an approximate Nash equilibrium (NE) policy pair based on a dataset collected a priori. When the dataset does not have uniform coverage over all policy pairs, finding an approximate NE involves challenges in three aspects: (i) distributional shift between the behavior policy and the optimal policy, (ii) function approximation to handle large state space, and (iii) minimax optimization for equilibrium solving. We propose a pessimism-based algorithm, dubbed as pessimistic minimax value iteration (PMVI), which overcomes the distributional shift by constructing pessimistic estimates of the value functions for both players and outputs a policy pair by solving a correlated coarse equilibrium based on the two value functions. Furthermore, we establish a data-dependent upper bound on the suboptimality which recovers a sublinear rate without the assumption on uniform coverage of the dataset. We also prove an information-theoretical lower bound, which shows our upper bound is nearly minimax optimal, which suggests that the data-dependent term is intrinsic. 
Our theoretical results also highlight a notion of \u201crelative uncertainty\u201d, which characterizes the necessary and sufficient condition for achieving sample efficiency in offline MGs. To the best of our knowledge, we provide the first nearly minimax optimal result for offline MGs with function approximation.", "bibtex": "@InProceedings{pmlr-v162-zhong22b,\n title = \t {Pessimistic Minimax Value Iteration: Provably Efficient Equilibrium Learning from Offline Datasets},\n author = {Zhong, Han and Xiong, Wei and Tan, Jiyuan and Wang, Liwei and Zhang, Tong and Wang, Zhaoran and Yang, Zhuoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27117--27142},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhong22b/zhong22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhong22b.html},\n abstract = \t {We study episodic two-player zero-sum Markov games (MGs) in the offline setting, where the goal is to find an approximate Nash equilibrium (NE) policy pair based on a dataset collected a priori. When the dataset does not have uniform coverage over all policy pairs, finding an approximate NE involves challenges in three aspects: (i) distributional shift between the behavior policy and the optimal policy, (ii) function approximation to handle large state space, and (iii) minimax optimization for equilibrium solving. We propose a pessimism-based algorithm, dubbed as pessimistic minimax value iteration (PMVI), which overcomes the distributional shift by constructing pessimistic estimates of the value functions for both players and outputs a policy pair by solving a correlated coarse equilibrium based on the two value functions. Furthermore, we establish a data-dependent upper bound on the suboptimality which recovers a sublinear rate without the assumption on uniform coverage of the dataset. We also prove an information-theoretical lower bound, which shows our upper bound is nearly minimax optimal, which suggests that the data-dependent term is intrinsic. Our theoretical results also highlight a notion of \u201crelative uncertainty\u201d, which characterizes the necessary and sufficient condition for achieving sample efficiency in offline MGs. 
To the best of our knowledge, we provide the first nearly minimax optimal result for offline MGs with function approximation.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhong22b/zhong22b.pdf", "supp": "", "pdf_size": 454687, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11542552447144160089&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Center for Data Science, Peking University + Key Laboratory of Machine Perception, MOE, School of Artificial Intelligence, Peking University; The Hong Kong University of Science and Technology; Fudan University; Key Laboratory of Machine Perception, MOE, School of Artificial Intelligence, Peking University; Google Research; Northwestern University; Yale University", "aff_domain": "stu.pku.edu.cn; ; ; ;gmail.com; ;yale.edu", "email": "stu.pku.edu.cn; ; ; ;gmail.com; ;yale.edu", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhong22b.html", "aff_unique_index": "0+0;1;2;0;3;4;5", "aff_unique_norm": "Peking University;Hong Kong University of Science and Technology;Fudan University;Google;Northwestern University;Yale University", "aff_unique_dep": "Center for Data Science;;;Google Research;;", "aff_unique_url": "http://www.pku.edu.cn;https://www.ust.hk;https://www.fudan.edu.cn;https://research.google;https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "PKU;HKUST;Fudan;Google Research;NU;Yale", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "Beijing;;Hong Kong SAR;Mountain View", "aff_country_unique_index": "0+0;0;0;0;1;1;1", "aff_country_unique": "China;United States" }, { "title": "Pessimistic Q-Learning for Offline Reinforcement Learning: Towards Optimal Sample Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16129", "id": "16129", "proceeding": "https://proceedings.mlr.press/v162/shi22c.html", "poster": "/media/PosterPDFs/ICML%202022/4b6538a44a1dfdc2b83477cd76dee98e_ZCbNnDI.png?t=1657405354.1621068", "slides": "", "author_site": "Laixi Shi, Gen Li, Yuting Wei, Yuxin Chen, Yuejie Chi", "author": "Laixi Shi; Gen Li; Yuting Wei; Yuxin Chen; Yuejie Chi", "abstract": "Offline or batch reinforcement learning seeks to learn a near-optimal policy using history data without active exploration of the environment. To counter the insufficient coverage and sample scarcity of many offline datasets, the principle of pessimism has been recently introduced to mitigate high bias of the estimated values. While pessimistic variants of model-based algorithms (e.g., value iteration with lower confidence bounds) have been theoretically investigated, their model-free counterparts \u2014 which do not require explicit model estimation \u2014 have not been adequately studied, especially in terms of sample efficiency. To address this inadequacy, we study a pessimistic variant of Q-learning in the context of finite-horizon Markov decision processes, and characterize its sample complexity under the single-policy concentrability assumption which does not require the full coverage of the state-action space. In addition, a variance-reduced pessimistic Q-learning algorithm is proposed to achieve near-optimal sample complexity. 
Altogether, this work highlights the efficiency of model-free algorithms in offline RL when used in conjunction with pessimism and variance reduction.", "bibtex": "@InProceedings{pmlr-v162-shi22c,\n title = \t {Pessimistic Q-Learning for Offline Reinforcement Learning: Towards Optimal Sample Complexity},\n author = {Shi, Laixi and Li, Gen and Wei, Yuting and Chen, Yuxin and Chi, Yuejie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19967--20025},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22c/shi22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22c.html},\n abstract = \t {Offline or batch reinforcement learning seeks to learn a near-optimal policy using history data without active exploration of the environment. To counter the insufficient coverage and sample scarcity of many offline datasets, the principle of pessimism has been recently introduced to mitigate high bias of the estimated values. While pessimistic variants of model-based algorithms (e.g., value iteration with lower confidence bounds) have been theoretically investigated, their model-free counterparts \u2014 which do not require explicit model estimation \u2014 have not been adequately studied, especially in terms of sample efficiency. To address this inadequacy, we study a pessimistic variant of Q-learning in the context of finite-horizon Markov decision processes, and characterize its sample complexity under the single-policy concentrability assumption which does not require the full coverage of the state-action space. In addition, a variance-reduced pessimistic Q-learning algorithm is proposed to achieve near-optimal sample complexity. 
Altogether, this work highlights the efficiency of model-free algorithms in offline RL when used in conjunction with pessimism and variance reduction.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22c/shi22c.pdf", "supp": "", "pdf_size": 1151205, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6419267522490315973&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Electrical and Computer Engineering, Carnegie Mellon University; Department of Statistics and Data Science, The Wharton School, University of Pennsylvania; Department of Statistics and Data Science, The Wharton School, University of Pennsylvania; Department of Statistics and Data Science, The Wharton School, University of Pennsylvania; Department of Electrical and Computer Engineering, Carnegie Mellon University", "aff_domain": "cmu.edu; ; ; ; ", "email": "cmu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/shi22c.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;University of Pennsylvania", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Statistics and Data Science", "aff_unique_url": "https://www.cmu.edu;https://www.upenn.edu", "aff_unique_abbr": "CMU;UPenn", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Phasic Self-Imitative Reduction for Sparse-Reward Goal-Conditioned Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17643", "id": "17643", "proceeding": "https://proceedings.mlr.press/v162/li22g.html", "poster": "/media/PosterPDFs/ICML%202022/5034a5d62f91942d2a7aeaf527dfe111.png?t=1657419830.591254", "slides": "", "author_site": "Yunfei Li, Tian Gao, Jiaqi Yang, Huazhe Xu, Yi Wu", "author": "Yunfei Li; Tian Gao; Jiaqi Yang; Huazhe Xu; Yi Wu", "abstract": "It has been a recent trend to leverage the power of supervised learning (SL) towards more effective reinforcement learning (RL) methods. We propose a novel phasic solution by alternating online RL and offline SL for tackling sparse-reward goal-conditioned problems. In the online phase, we perform RL training and collect rollout data while in the offline phase, we perform SL on those successful trajectories from the dataset. To further improve sample efficiency, we adopt additional techniques in the online phase including task reduction to generate more feasible trajectories and a value-difference-based intrinsic reward to alleviate the sparse-reward issue. We call this overall framework, PhAsic self-Imitative Reduction (PAIR). PAIR is compatible with various online and offline RL methods and substantially outperforms both non-phasic RL and phasic SL baselines on sparse-reward robotic control problems, including a particularly challenging stacking task. 
PAIR is the first RL method that learns to stack 6 cubes with only 0/1 success rewards from scratch.", "bibtex": "@InProceedings{pmlr-v162-li22g,\n title = \t {Phasic Self-Imitative Reduction for Sparse-Reward Goal-Conditioned Reinforcement Learning},\n author = {Li, Yunfei and Gao, Tian and Yang, Jiaqi and Xu, Huazhe and Wu, Yi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12765--12781},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22g/li22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22g.html},\n abstract = \t {It has been a recent trend to leverage the power of supervised learning (SL) towards more effective reinforcement learning (RL) methods. We propose a novel phasic solution by alternating online RL and offline SL for tackling sparse-reward goal-conditioned problems. In the online phase, we perform RL training and collect rollout data while in the offline phase, we perform SL on those successful trajectories from the dataset. To further improve sample efficiency, we adopt additional techniques in the online phase including task reduction to generate more feasible trajectories and a value-difference-based intrinsic reward to alleviate the sparse-reward issue. We call this overall framework, PhAsic self-Imitative Reduction (PAIR). PAIR is compatible with various online and offline RL methods and substantially outperforms both non-phasic RL and phasic SL baselines on sparse-reward robotic control problems, including a particularly challenging stacking task. 
PAIR is the first RL method that learns to stack 6 cubes with only 0/1 success rewards from scratch.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22g/li22g.pdf", "supp": "", "pdf_size": 5446366, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17388872242068670151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA; Stanford University, CA, USA; Shanghai Qi Zhi Institute, Shanghai, China + Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn; ; ; ;gmail.com", "email": "mails.tsinghua.edu.cn; ; ; ;gmail.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/li22g.html", "aff_unique_index": "0;0;1;2;3+0", "aff_unique_norm": "Tsinghua University;University of California, Berkeley;Stanford University;Shanghai Qi Zhi Institute", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;Department of Electrical Engineering and Computer Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.berkeley.edu;https://www.stanford.edu;", "aff_unique_abbr": "Tsinghua;UC Berkeley;Stanford;", "aff_campus_unique_index": "0;0;1;2;3+0", "aff_campus_unique": "Beijing;Berkeley;California;Shanghai", "aff_country_unique_index": "0;0;1;1;0+0", "aff_country_unique": "China;United States" }, { "title": "Plan Better Amid Conservatism: Offline Multi-Agent Reinforcement Learning with Actor Rectification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17591", "id": "17591", "proceeding": "https://proceedings.mlr.press/v162/pan22a.html", "poster": "/media/PosterPDFs/ICML%202022/52947e0ade57a09e4a1386d08f17b656.png?t=1658026192.4603004", "slides": "", "author_site": "Ling Pan, Longbo Huang, Tengyu Ma, Huazhe Xu", "author": "Ling Pan; Longbo Huang; Tengyu Ma; Huazhe Xu", "abstract": "Conservatism has led to significant progress in offline reinforcement learning (RL) where an agent learns from pre-collected datasets. However, as many real-world scenarios involve interaction among multiple agents, it is important to resolve offline RL in the multi-agent setting. Given the recent success of transferring online RL algorithms to the multi-agent setting, one may expect that offline RL algorithms will also transfer to multi-agent settings directly. Surprisingly, we empirically observe that conservative offline RL algorithms do not work well in the multi-agent setting\u2014the performance degrades significantly with an increasing number of agents. Towards mitigating the degradation, we identify a key issue that non-concavity of the value function makes the policy gradient improvements prone to local optima. Multiple agents exacerbate the problem severely, since the suboptimal policy by any agent can lead to uncoordinated global failure. Following this intuition, we propose a simple yet effective method, Offline Multi-Agent RL with Actor Rectification (OMAR), which combines the first-order policy gradients and zeroth-order optimization methods to better optimize the conservative value functions over the actor parameters. 
Despite the simplicity, OMAR achieves state-of-the-art results in a variety of multi-agent control tasks.", "bibtex": "@InProceedings{pmlr-v162-pan22a,\n title = \t {Plan Better Amid Conservatism: Offline Multi-Agent Reinforcement Learning with Actor Rectification},\n author = {Pan, Ling and Huang, Longbo and Ma, Tengyu and Xu, Huazhe},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17221--17237},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pan22a/pan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pan22a.html},\n abstract = \t {Conservatism has led to significant progress in offline reinforcement learning (RL) where an agent learns from pre-collected datasets. However, as many real-world scenarios involve interaction among multiple agents, it is important to resolve offline RL in the multi-agent setting. Given the recent success of transferring online RL algorithms to the multi-agent setting, one may expect that offline RL algorithms will also transfer to multi-agent settings directly. Surprisingly, we empirically observe that conservative offline RL algorithms do not work well in the multi-agent setting\u2014the performance degrades significantly with an increasing number of agents. Towards mitigating the degradation, we identify a key issue that non-concavity of the value function makes the policy gradient improvements prone to local optima. Multiple agents exacerbate the problem severely, since the suboptimal policy by any agent can lead to uncoordinated global failure. Following this intuition, we propose a simple yet effective method, Offline Multi-Agent RL with Actor Rectification (OMAR), which combines the first-order policy gradients and zeroth-order optimization methods to better optimize the conservative value functions over the actor parameters. 
Despite the simplicity, OMAR achieves state-of-the-art results in a variety of multi-agent control tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/pan22a/pan22a.pdf", "supp": "", "pdf_size": 1736167, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14725479827547401552&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University; Institute for Interdisciplinary Information Sciences, Tsinghua University; Stanford University; Stanford University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;stanford.edu;stanford.edu", "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn;stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/pan22a.html", "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Tsinghua University;Stanford University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "Tsinghua;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Plan Your Target and Learn Your Skills: Transferable State-Only Imitation Learning via Decoupled Policy Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16291", "id": "16291", "proceeding": "https://proceedings.mlr.press/v162/liu22x.html", "poster": "/media/PosterPDFs/ICML%202022/3b8a614226a953a8cd9526fca6fe9ba5.png?t=1656577109.5146804", "slides": "/media/icml-2022/Slides/16291.pdf", "author_site": "Minghuan Liu, Zhengbang Zhu, Yuzheng Zhuang, Weinan Zhang, Jianye Hao, Yong Yu, Jun Wang", "author": "Minghuan Liu; Zhengbang Zhu; Yuzheng Zhuang; Weinan Zhang; Jianye Hao; Yong Yu; Jun Wang", "abstract": "Recent progress in state-only imitation learning extends the scope of applicability of imitation learning to real-world settings by relieving the need for observing expert actions. However, existing solutions only learn to extract a state-to-action mapping policy from the data, without considering how the expert plans to the target. This hinders the ability to leverage demonstrations and limits the flexibility of the policy. In this paper, we introduce Decoupled Policy Optimization (DePO), which explicitly decouples the policy as a high-level state planner and an inverse dynamics model. With embedded decoupled policy gradient and generative adversarial training, DePO enables knowledge transfer to different action spaces or state transition dynamics, and can generalize the planner to out-of-demonstration state regions. Our in-depth experimental analysis shows the effectiveness of DePO on learning a generalized target state planner while achieving the best imitation performance. 
We demonstrate the appealing usage of DePO for transferring across different tasks by pre-training, and the potential for co-training agents with various skills.", "bibtex": "@InProceedings{pmlr-v162-liu22x,\n title = \t {Plan Your Target and Learn Your Skills: Transferable State-Only Imitation Learning via Decoupled Policy Optimization},\n author = {Liu, Minghuan and Zhu, Zhengbang and Zhuang, Yuzheng and Zhang, Weinan and Hao, Jianye and Yu, Yong and Wang, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14173--14196},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22x/liu22x.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22x.html},\n abstract = \t {Recent progress in state-only imitation learning extends the scope of applicability of imitation learning to real-world settings by relieving the need for observing expert actions. However, existing solutions only learn to extract a state-to-action mapping policy from the data, without considering how the expert plans to the target. This hinders the ability to leverage demonstrations and limits the flexibility of the policy. In this paper, we introduce Decoupled Policy Optimization (DePO), which explicitly decouples the policy as a high-level state planner and an inverse dynamics model. With embedded decoupled policy gradient and generative adversarial training, DePO enables knowledge transfer to different action spaces or state transition dynamics, and can generalize the planner to out-of-demonstration state regions. Our in-depth experimental analysis shows the effectiveness of DePO on learning a generalized target state planner while achieving the best imitation performance. 
We demonstrate the appealing usage of DePO for transferring across different tasks by pre-training, and the potential for co-training agents with various skills.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22x/liu22x.pdf", "supp": "", "pdf_size": 15287837, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9268617750672315450&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab; Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab + Tianjin University; Shanghai Jiao Tong University; Huawei Noah\u2019s Ark Lab", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;huawei.com;sjtu.edu.cn;tju.edu.cn;sjtu.edu.cn;huawei.com", "email": "sjtu.edu.cn;sjtu.edu.cn;huawei.com;sjtu.edu.cn;tju.edu.cn;sjtu.edu.cn;huawei.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/liu22x.html", "aff_unique_index": "0;0;1;0;1+2;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Huawei;Tianjin University", "aff_unique_dep": ";Noah\u2019s Ark Lab;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.huawei.com;http://www.tju.edu.cn", "aff_unique_abbr": "SJTU;Huawei;TJU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0+0;0;0", "aff_country_unique": "China" }, { "title": "Planning with Diffusion for Flexible Behavior Synthesis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18291", "id": "18291", "proceeding": "https://proceedings.mlr.press/v162/janner22a.html", "poster": "/media/PosterPDFs/ICML%202022/a4bd4d2b1cc64abf1fffb8103da2b890_pPVy71N.png?t=1658120756.344099", "slides": "/media/icml-2022/Slides/18291.pdf", "author_site": "Michael Janner, Yilun Du, Josh Tenenbaum, Sergey Levine", "author": "Michael Janner; Yilun Du; Joshua Tenenbaum; Sergey Levine", "abstract": "Model-based reinforcement learning methods often use learning only for the purpose of recovering an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. 
We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.", "bibtex": "@InProceedings{pmlr-v162-janner22a,\n title = \t {Planning with Diffusion for Flexible Behavior Synthesis},\n author = {Janner, Michael and Du, Yilun and Tenenbaum, Joshua and Levine, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9902--9915},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/janner22a/janner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/janner22a.html},\n abstract = \t {Model-based reinforcement learning methods often use learning only for the purpose of recovering an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. 
We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.}\n}", "pdf": "https://proceedings.mlr.press/v162/janner22a/janner22a.pdf", "supp": "", "pdf_size": 11114477, "gs_citation": 764, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17441916079353459921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of California, Berkeley; MIT; MIT; University of California, Berkeley", "aff_domain": "berkeley.edu;mit.edu; ; ", "email": "berkeley.edu;mit.edu; ; ", "github": "https://github.com/diffusion-planning", "project": "https://diffusion-planning.github.io", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/janner22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of California, Berkeley;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://web.mit.edu", "aff_unique_abbr": "UC Berkeley;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Plug & Play Attacks: Towards Robust and Flexible Model Inversion Attacks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16275", "id": "16275", "proceeding": "https://proceedings.mlr.press/v162/struppek22a.html", "poster": "/media/PosterPDFs/ICML%202022/6883966fd8f918a4aa29be29d2c386fb.png?t=1656575834.00467", "slides": "", "author_site": "Lukas Struppek, Dominik Hintersdorf, Antonio De Almeida Correia, Antonia Adler, Kristian Kersting", "author": "Lukas Struppek; Dominik Hintersdorf; Antonio De Almeida Correira; Antonia Adler; Kristian Kersting", "abstract": "Model inversion attacks (MIAs) aim to create synthetic images that reflect the class-wise characteristics from a target classifier\u2019s private training data by exploiting the model\u2019s learned knowledge. Previous research has developed generative MIAs that use generative adversarial networks (GANs) as image priors tailored to a specific target model. This makes the attacks time- and resource-consuming, inflexible, and susceptible to distributional shifts between datasets. To overcome these drawbacks, we present Plug & Play Attacks, which relax the dependency between the target model and image prior, and enable the use of a single GAN to attack a wide range of targets, requiring only minor adjustments to the attack. Moreover, we show that powerful MIAs are possible even with publicly available pre-trained GANs and under strong distributional shifts, for which previous approaches fail to produce meaningful results. 
Our extensive evaluation confirms the improved robustness and flexibility of Plug & Play Attacks and their ability to create high-quality images revealing sensitive class characteristics.", "bibtex": "@InProceedings{pmlr-v162-struppek22a,\n title = \t {Plug & Play Attacks: Towards Robust and Flexible Model Inversion Attacks},\n author = {Struppek, Lukas and Hintersdorf, Dominik and De Almeida Correira, Antonio and Adler, Antonia and Kersting, Kristian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20522--20545},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/struppek22a/struppek22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/struppek22a.html},\n abstract = \t {Model inversion attacks (MIAs) aim to create synthetic images that reflect the class-wise characteristics from a target classifier\u2019s private training data by exploiting the model\u2019s learned knowledge. Previous research has developed generative MIAs that use generative adversarial networks (GANs) as image priors tailored to a specific target model. This makes the attacks time- and resource-consuming, inflexible, and susceptible to distributional shifts between datasets. To overcome these drawbacks, we present Plug & Play Attacks, which relax the dependency between the target model and image prior, and enable the use of a single GAN to attack a wide range of targets, requiring only minor adjustments to the attack. Moreover, we show that powerful MIAs are possible even with publicly available pre-trained GANs and under strong distributional shifts, for which previous approaches fail to produce meaningful results. 
Our extensive evaluation confirms the improved robustness and flexibility of Plug & Play Attacks and their ability to create high-quality images revealing sensitive class characteristics.}\n}", "pdf": "https://proceedings.mlr.press/v162/struppek22a/struppek22a.pdf", "supp": "", "pdf_size": 11575027, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10382805845190184141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Technical University of Darmstadt, Germany; Department of Computer Science, Technical University of Darmstadt, Germany; Department of Computer Science, Technical University of Darmstadt, Germany; Universit\u00e4t der Bundeswehr M\u00fcnchen, Munich, Germany; Department of Computer Science, Technical University of Darmstadt, Germany + Centre for Cognitive Science, TU Darmstadt, Germany + Hessian Center for AI (hessian.AI), Germany", "aff_domain": "cs.tu-darmstadt.de; ; ; ; ", "email": "cs.tu-darmstadt.de; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/struppek22a.html", "aff_unique_index": "0;0;0;1;0+2+3", "aff_unique_norm": "Technical University of Darmstadt;Universit\u00e4t der Bundeswehr M\u00fcnchen;Technische Universit\u00e4t Darmstadt;Hessian Center for AI", "aff_unique_dep": "Department of Computer Science;;Centre for Cognitive Science;AI Research", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.unibw.de;https://www.tu-darmstadt.de;https://hessian.ai", "aff_unique_abbr": "TUD;UniBw;TU Darmstadt;hessian.AI", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Munich", "aff_country_unique_index": "0;0;0;0;0+0+0", "aff_country_unique": "Germany" }, { "title": "Plug-In Inversion: Model-Agnostic Inversion for Vision with Data Augmentations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16375", "id": "16375", "proceeding": "https://proceedings.mlr.press/v162/ghiasi22a.html", "poster": "/media/PosterPDFs/ICML%202022/df334b223e699294764c2bb7ae40d8db.png?t=1658102258.7534435", "slides": "/media/icml-2022/Slides/16375_PQ5ItL9.pdf", "author_site": "Amin Ghiasi, Hamid Kazemi, Steven Reich, Chen Zhu, Micah Goldblum, Tom Goldstein", "author": "Amin Ghiasi; Hamid Kazemi; Steven Reich; Chen Zhu; Micah Goldblum; Tom Goldstein", "abstract": "Existing techniques for model inversion typically rely on hard-to-tune regularizers, such as total variation or feature regularization, which must be individually calibrated for each network in order to produce adequate images. In this work, we introduce Plug-In Inversion, which relies on a simple set of augmentations and does not require excessive hyper-parameter tuning. Under our proposed augmentation-based scheme, the same set of augmentation hyper-parameters can be used for inverting a wide range of image classification models, regardless of input dimensions or the architecture. 
We illustrate the practicality of our approach by inverting Vision Transformers (ViTs) and Multi-Layer Perceptrons (MLPs) trained on the ImageNet dataset, tasks which to the best of our knowledge have not been successfully accomplished by any previous works.", "bibtex": "@InProceedings{pmlr-v162-ghiasi22a,\n title = \t {Plug-In Inversion: Model-Agnostic Inversion for Vision with Data Augmentations},\n author = {Ghiasi, Amin and Kazemi, Hamid and Reich, Steven and Zhu, Chen and Goldblum, Micah and Goldstein, Tom},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7484--7512},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ghiasi22a/ghiasi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ghiasi22a.html},\n abstract = \t {Existing techniques for model inversion typically rely on hard-to-tune regularizers, such as total variation or feature regularization, which must be individually calibrated for each network in order to produce adequate images. In this work, we introduce Plug-In Inversion, which relies on a simple set of augmentations and does not require excessive hyper-parameter tuning. Under our proposed augmentation-based scheme, the same set of augmentation hyper-parameters can be used for inverting a wide range of image classification models, regardless of input dimensions or the architecture. We illustrate the practicality of our approach by inverting Vision Transformers (ViTs) and Multi-Layer Perceptrons (MLPs) trained on the ImageNet dataset, tasks which to the best of our knowledge have not been successfully accomplished by any previous works.}\n}", "pdf": "https://proceedings.mlr.press/v162/ghiasi22a/ghiasi22a.pdf", "supp": "", "pdf_size": 36475741, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3783911125052785325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, University of Maryland, College Park, USA+New York University Center for Data Science, New York, USA; Department of Computer Science, University of Maryland, College Park, USA+New York University Center for Data Science, New York, USA; Department of Computer Science, University of Maryland, College Park, USA+New York University Center for Data Science, New York, USA; Department of Computer Science, University of Maryland, College Park, USA; New York University Center for Data Science, New York, USA; Department of Computer Science, University of Maryland, College Park, USA", "aff_domain": "umd.edu;umd.edu; ; ; ; ", "email": "umd.edu;umd.edu; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ghiasi22a.html", "aff_unique_index": "0+1;0+1;0+1;0;1;0", "aff_unique_norm": "University of Maryland, College Park;New York University", "aff_unique_dep": "Department of Computer Science;Center for Data Science", "aff_unique_url": "https://www/umd.edu;https://www.nyu.edu", "aff_unique_abbr": "UMD;NYU", "aff_campus_unique_index": "0+1;0+1;0+1;0;1;0", "aff_campus_unique": "College Park;New York", "aff_country_unique_index": "0+0;0+0;0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "PoF: Post-Training of Feature Extractor for Improving Generalization", "status": 
"Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17727", "id": "17727", "proceeding": "https://proceedings.mlr.press/v162/sato22a.html", "poster": "/media/PosterPDFs/ICML%202022/e702e51da2c0f5be4dd354bb3e295d37.png?t=1657271821.5870442", "slides": "", "author_site": "Ikuro Sato, Yamada Ryota, Masayuki Tanaka, Nakamasa Inoue, Rei Kawakami", "author": "Ikuro Sato; Yamada Ryota; Masayuki Tanaka; Nakamasa Inoue; Rei Kawakami", "abstract": "It has been intensively investigated that the local shape, especially flatness, of the loss landscape near a minimum plays an important role for generalization of deep models. We developed a training algorithm called PoF: Post-Training of Feature Extractor that updates the feature extractor part of an already-trained deep model to search a flatter minimum. The characteristics are two-fold: 1) Feature extractor is trained under parameter perturbations in the higher-layer parameter space, based on observations that suggest flattening higher-layer parameter space, and 2) the perturbation range is determined in a data-driven manner aiming to reduce a part of test loss caused by the positive loss curvature. We provide a theoretical analysis that shows the proposed algorithm implicitly reduces the target Hessian components as well as the loss. Experimental results show that PoF improved model performance against baseline methods on both CIFAR-10 and CIFAR-100 datasets for only 10-epoch post-training, and on SVHN dataset for 50-epoch post-training.", "bibtex": "@InProceedings{pmlr-v162-sato22a,\n title = \t {{P}o{F}: Post-Training of Feature Extractor for Improving Generalization},\n author = {Sato, Ikuro and Ryota, Yamada and Tanaka, Masayuki and Inoue, Nakamasa and Kawakami, Rei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19221--19230},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sato22a/sato22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sato22a.html},\n abstract = \t {It has been intensively investigated that the local shape, especially flatness, of the loss landscape near a minimum plays an important role for generalization of deep models. We developed a training algorithm called PoF: Post-Training of Feature Extractor that updates the feature extractor part of an already-trained deep model to search a flatter minimum. The characteristics are two-fold: 1) Feature extractor is trained under parameter perturbations in the higher-layer parameter space, based on observations that suggest flattening higher-layer parameter space, and 2) the perturbation range is determined in a data-driven manner aiming to reduce a part of test loss caused by the positive loss curvature. We provide a theoretical analysis that shows the proposed algorithm implicitly reduces the target Hessian components as well as the loss. 
Experimental results show that PoF improved model performance against baseline methods on both CIFAR-10 and CIFAR-100 datasets for only 10-epoch post-training, and on SVHN dataset for 50-epoch post-training.}\n}", "pdf": "https://proceedings.mlr.press/v162/sato22a/sato22a.pdf", "supp": "", "pdf_size": 854736, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1799078834754218861&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Computing, Tokyo Institute of Technology, Japan+Denso IT Laboratory, inc., Japan; School of Computing, Tokyo Institute of Technology, Japan+Denso IT Laboratory, inc., Japan; School of Computing, Tokyo Institute of Technology, Japan; School of Computing, Tokyo Institute of Technology, Japan; School of Computing, Tokyo Institute of Technology, Japan+Denso IT Laboratory, inc., Japan", "aff_domain": "c.titech.ac.jp; ; ; ; ", "email": "c.titech.ac.jp; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/sato22a.html", "aff_unique_index": "0+1;0+1;0;0;0+1", "aff_unique_norm": "Tokyo Institute of Technology;Denso IT Laboratory, Inc.", "aff_unique_dep": "School of Computing;", "aff_unique_url": "https://www.titech.ac.jp;https://www.denso.com", "aff_unique_abbr": "Titech;Denso IT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0+0;0+0;0;0;0+0", "aff_country_unique": "Japan" }, { "title": "Pocket2Mol: Efficient Molecular Sampling Based on 3D Protein Pockets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17849", "id": "17849", "proceeding": "https://proceedings.mlr.press/v162/peng22b.html", "poster": "/media/PosterPDFs/ICML%202022/ac3e2c4e1d4bd07fb973a2ea4d250160.png?t=1657451975.9683561", "slides": "", "author_site": "Xingang Peng, Shitong Luo, Jiaqi Guan, Qi Xie, Jian Peng, Jianzhu Ma", "author": "Xingang Peng; Shitong Luo; Jiaqi Guan; Qi Xie; Jian Peng; Jianzhu Ma", "abstract": "Deep generative models have achieved tremendous success in designing novel drug molecules in recent years. A new thread of works have shown potential in advancing the specificity and success rate of in silico drug design by considering the structure of protein pockets. This setting posts fundamental computational challenges in sampling new chemical compounds that could satisfy multiple geometrical constraints imposed by pockets. Previous sampling algorithms either sample in the graph space or only consider the 3D coordinates of atoms while ignoring other detailed chemical structures such as bond types and functional groups. To address the challenge, we develop an E(3)-equivariant generative network composed of two modules: 1) a new graph neural network capturing both spatial and bonding relationships between atoms of the binding pockets and 2) a new efficient algorithm which samples new drug candidates conditioned on the pocket representations from a tractable distribution without relying on MCMC. 
Experimental results demonstrate that molecules sampled from Pocket2Mol achieve significantly better binding affinity and other drug properties such as drug-likeness and synthetic accessibility.", "bibtex": "@InProceedings{pmlr-v162-peng22b,\n title = \t {{P}ocket2{M}ol: Efficient Molecular Sampling Based on 3{D} Protein Pockets},\n author = {Peng, Xingang and Luo, Shitong and Guan, Jiaqi and Xie, Qi and Peng, Jian and Ma, Jianzhu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17644--17655},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/peng22b/peng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/peng22b.html},\n abstract = \t {Deep generative models have achieved tremendous success in designing novel drug molecules in recent years. A new thread of works have shown potential in advancing the specificity and success rate of in silico drug design by considering the structure of protein pockets. This setting posts fundamental computational challenges in sampling new chemical compounds that could satisfy multiple geometrical constraints imposed by pockets. Previous sampling algorithms either sample in the graph space or only consider the 3D coordinates of atoms while ignoring other detailed chemical structures such as bond types and functional groups. To address the challenge, we develop an E(3)-equivariant generative network composed of two modules: 1) a new graph neural network capturing both spatial and bonding relationships between atoms of the binding pockets and 2) a new efficient algorithm which samples new drug candidates conditioned on the pocket representations from a tractable distribution without relying on MCMC. Experimental results demonstrate that molecules sampled from Pocket2Mol achieve significantly better binding affinity and other drug properties such as drug-likeness and synthetic accessibility.}\n}", "pdf": "https://proceedings.mlr.press/v162/peng22b/peng22b.pdf", "supp": "", "pdf_size": 6593051, "gs_citation": 246, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5422392293509643070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/peng22b.html" }, { "title": "Policy Diagnosis via Measuring Role Diversity in Cooperative Multi-agent RL", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17467", "id": "17467", "proceeding": "https://proceedings.mlr.press/v162/hu22c.html", "poster": "/media/PosterPDFs/ICML%202022/a6ea8471c120fe8cc35a2954c9b9c595.png?t=1657515006.0094264", "slides": "", "author_site": "Siyi Hu, Chuanlong Xie, Xiaodan Liang, Xiaojun Chang", "author": "Siyi Hu; Chuanlong Xie; Xiaodan Liang; Xiaojun Chang", "abstract": "Cooperative multi-agent reinforcement learning (MARL) is making rapid progress for solving tasks in a grid world and real-world scenarios, in which agents are given different attributes and goals, resulting in different behavior through the whole multi-agent task. 
In this study, we quantify the agent\u2019s behavior difference and build its relationship with the policy performance via {\\bf Role Diversity}, a metric to measure the characteristics of MARL tasks. We define role diversity from three perspectives: action-based, trajectory-based, and contribution-based to fully measure a multi-agent task. Through theoretical analysis, we find that the error bound in MARL can be decomposed into three parts that have a strong relation to the role diversity. The decomposed factors can significantly impact policy optimization in three popular directions including parameter sharing, communication mechanism, and credit assignment. The main experimental platforms are based on {\\bf Multiagent Particle Environment (MPE) }and {\\bf The StarCraft Multi-Agent Challenge (SMAC)}. Extensive experiments clearly show that role diversity can serve as a robust measurement for the characteristics of a multi-agent cooperation task and help diagnose whether the policy fits the current multi-agent system for better policy performance.", "bibtex": "@InProceedings{pmlr-v162-hu22c,\n title = \t {Policy Diagnosis via Measuring Role Diversity in Cooperative Multi-agent {RL}},\n author = {Hu, Siyi and Xie, Chuanlong and Liang, Xiaodan and Chang, Xiaojun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9041--9071},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hu22c/hu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/hu22c.html},\n abstract = \t {Cooperative multi-agent reinforcement learning (MARL) is making rapid progress for solving tasks in a grid world and real-world scenarios, in which agents are given different attributes and goals, resulting in different behavior through the whole multi-agent task. In this study, we quantify the agent\u2019s behavior difference and build its relationship with the policy performance via {\\bf Role Diversity}, a metric to measure the characteristics of MARL tasks. We define role diversity from three perspectives: action-based, trajectory-based, and contribution-based to fully measure a multi-agent task. Through theoretical analysis, we find that the error bound in MARL can be decomposed into three parts that have a strong relation to the role diversity. The decomposed factors can significantly impact policy optimization in three popular directions including parameter sharing, communication mechanism, and credit assignment. The main experimental platforms are based on {\\bf Multiagent Particle Environment (MPE) }and {\\bf The StarCraft Multi-Agent Challenge (SMAC)}. 
Extensive experiments clearly show that role diversity can serve as a robust measurement for the characteristics of a multi-agent cooperation task and help diagnose whether the policy fits the current multi-agent system for better policy performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/hu22c/hu22c.pdf", "supp": "", "pdf_size": 5590442, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13122313235205231044&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Monash University + The ReLER Lab, University of Technology Sydney; Beijing Normal University + Huawei Noah\u2019s Ark Lab; Sun Yat-sen University; The ReLER Lab, University of Technology Sydney", "aff_domain": "uts.edu.au; ; ;uts.edu.au", "email": "uts.edu.au; ; ;uts.edu.au", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hu22c.html", "aff_unique_index": "0+1;2+3;4;1", "aff_unique_norm": "Monash University;University of Technology Sydney;Beijing Normal University;Huawei;Sun Yat-sen University", "aff_unique_dep": ";The ReLER Lab;;Noah\u2019s Ark Lab;", "aff_unique_url": "https://www.monash.edu;https://www.uts.edu.au;https://www.bnu.edu.cn;https://www.huawei.com;http://www.sysu.edu.cn/", "aff_unique_abbr": "Monash;UTS;BNU;Huawei;SYSU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+0;1+1;1;0", "aff_country_unique": "Australia;China" }, { "title": "Policy Gradient Method For Robust Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16373", "id": "16373", "proceeding": "https://proceedings.mlr.press/v162/wang22at.html", "poster": "/media/PosterPDFs/ICML%202022/93db85ed909c13838ff95ccfa94cebd9.png?t=1657215280.467563", "slides": "/media/icml-2022/Slides/16373.pdf", "author_site": "Yue Wang, Shaofeng Zou", "author": "Yue Wang; Shaofeng Zou", "abstract": "This paper develops the first policy gradient method with global optimality guarantee and complexity analysis for robust reinforcement learning under model mismatch. Robust reinforcement learning is to learn a policy robust to model mismatch between simulator and real environment. We first develop the robust policy (sub-)gradient, which is applicable for any differentiable parametric policy class. We show that the proposed robust policy gradient method converges to the global optimum asymptotically under direct policy parameterization. We further develop a smoothed robust policy gradient method, and show that to achieve an $\\epsilon$-global optimum, the complexity is $\\mathcal O(\\epsilon^{-3})$. We then extend our methodology to the general model-free setting, and design the robust actor-critic method with differentiable parametric policy class and value function. We further characterize its asymptotic convergence and sample complexity under the tabular setting. 
Finally, we provide simulation results to demonstrate the robustness of our methods.", "bibtex": "@InProceedings{pmlr-v162-wang22at,\n title = \t {Policy Gradient Method For Robust Reinforcement Learning},\n author = {Wang, Yue and Zou, Shaofeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23484--23526},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22at/wang22at.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22at.html},\n abstract = \t {This paper develops the first policy gradient method with global optimality guarantee and complexity analysis for robust reinforcement learning under model mismatch. Robust reinforcement learning is to learn a policy robust to model mismatch between simulator and real environment. We first develop the robust policy (sub-)gradient, which is applicable for any differentiable parametric policy class. We show that the proposed robust policy gradient method converges to the global optimum asymptotically under direct policy parameterization. We further develop a smoothed robust policy gradient method, and show that to achieve an $\\epsilon$-global optimum, the complexity is $\\mathcal O(\\epsilon^{-3})$. We then extend our methodology to the general model-free setting, and design the robust actor-critic method with differentiable parametric policy class and value function. We further characterize its asymptotic convergence and sample complexity under the tabular setting. Finally, we provide simulation results to demonstrate the robustness of our methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22at/wang22at.pdf", "supp": "", "pdf_size": 2570220, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3649533470992686417&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering, University at Buffalo, New York, USA; Department of Electrical Engineering, University at Buffalo, New York, USA", "aff_domain": "buffalo.edu;buffalo.edu", "email": "buffalo.edu;buffalo.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22at.html", "aff_unique_index": "0;0", "aff_unique_norm": "University at Buffalo", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "UB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Popular decision tree algorithms are provably noise tolerant", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17493", "id": "17493", "proceeding": "https://proceedings.mlr.press/v162/blanc22b.html", "poster": "", "slides": "", "author_site": "Guy Blanc, Jane Lange, Ali Malik, Li-Yang Tan", "author": "Guy Blanc; Jane Lange; Ali Malik; Li-Yang Tan", "abstract": "Using the framework of boosting, we prove that all impurity-based decision tree learning algorithms, including the classic ID3, C4.5, and CART, are highly noise tolerant. Our guarantees hold under the strongest noise model of nasty noise, and we provide near-matching upper and lower bounds on the allowable noise rate. 
We further show that these algorithms, which are simple and have long been central to everyday machine learning, enjoy provable guarantees in the noisy setting that are unmatched by existing algorithms in the theoretical literature on decision tree learning. Taken together, our results add to an ongoing line of research that seeks to place the empirical success of these practical decision tree algorithms on firm theoretical footing.", "bibtex": "@InProceedings{pmlr-v162-blanc22b,\n title = \t {Popular decision tree algorithms are provably noise tolerant},\n author = {Blanc, Guy and Lange, Jane and Malik, Ali and Tan, Li-Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2091--2106},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/blanc22b/blanc22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/blanc22b.html},\n abstract = \t {Using the framework of boosting, we prove that all impurity-based decision tree learning algorithms, including the classic ID3, C4.5, and CART, are highly noise tolerant. Our guarantees hold under the strongest noise model of nasty noise, and we provide near-matching upper and lower bounds on the allowable noise rate. We further show that these algorithms, which are simple and have long been central to everyday machine learning, enjoy provable guarantees in the noisy setting that are unmatched by existing algorithms in the theoretical literature on decision tree learning. Taken together, our results add to an ongoing line of research that seeks to place the empirical success of these practical decision tree algorithms on firm theoretical footing.}\n}", "pdf": "https://proceedings.mlr.press/v162/blanc22b/blanc22b.pdf", "supp": "", "pdf_size": 368181, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16667303393886822373&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Massachusetts Institute of Technology; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "stanford.edu;mit.edu;cs.stanford.edu;cs.stanford.edu", "email": "stanford.edu;mit.edu;cs.stanford.edu;cs.stanford.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/blanc22b.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu", "aff_unique_abbr": "Stanford;MIT", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Stanford;Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Position Prediction as an Effective Pretraining Strategy", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18267", "id": "18267", "proceeding": "https://proceedings.mlr.press/v162/zhai22a.html", "poster": "/media/PosterPDFs/ICML%202022/956685427c5cd9dcb04f784272727336.png?t=1658168437.1601236", "slides": "", "author_site": "Shuangfei Zhai, Navdeep Jaitly, Jason Ramapuram, 
Dan Busbridge, Tatiana Likhomanenko, Joseph Cheng, Walter Talbott, Chen Huang, Hanlin Goh, Joshua M Susskind", "author": "Shuangfei Zhai; Navdeep Jaitly; Jason Ramapuram; Dan Busbridge; Tatiana Likhomanenko; Joseph Y Cheng; Walter Talbott; Chen Huang; Hanlin Goh; Joshua M Susskind", "abstract": "Transformers \\cite{transformer} have gained increasing popularity in a wide range of applications, including Natural Language Processing (NLP), Computer Vision and Speech Recognition, because of their powerful representational capacity. However, harnessing this representational capacity effectively requires a large amount of data, strong regularization, or both, to mitigate overfitting. Recently, the power of the Transformer has been unlocked by self-supervised pretraining strategies based on masked autoencoders which rely on reconstructing masked inputs, directly, or contrastively from unmasked content. This pretraining strategy which has been used in BERT models in NLP \\cite{bert}, Wav2Vec models in Speech \\cite{wv2v2} and, recently, in MAE models in Vision \\cite{beit, mae}, forces the model to learn about relationships between the content in different parts of the input using autoencoding related objectives. In this paper, we propose a novel, but surprisingly simple alternative to content reconstruction\u00a0\u2013 that of predicting locations from content, without providing positional information for it. Doing so requires the Transformer to understand the positional relationships between different parts of the input, from their content alone. This amounts to an efficient implementation where the pretext task is a classification problem among all possible positions for each input token. We experiment on both Vision and Speech benchmarks, where our approach brings improvements over strong supervised training baselines and is comparable to modern unsupervised/self-supervised pretraining methods. Our method also enables Transformers trained without position embeddings to outperform ones trained with full position information.", "bibtex": "@InProceedings{pmlr-v162-zhai22a,\n title = \t {Position Prediction as an Effective Pretraining Strategy},\n author = {Zhai, Shuangfei and Jaitly, Navdeep and Ramapuram, Jason and Busbridge, Dan and Likhomanenko, Tatiana and Cheng, Joseph Y and Talbott, Walter and Huang, Chen and Goh, Hanlin and Susskind, Joshua M},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26010--26027},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhai22a/zhai22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhai22a.html},\n abstract = \t {Transformers \\cite{transformer} have gained increasing popularity in a wide range of applications, including Natural Language Processing (NLP), Computer Vision and Speech Recognition, because of their powerful representational capacity. However, harnessing this representational capacity effectively requires a large amount of data, strong regularization, or both, to mitigate overfitting. Recently, the power of the Transformer has been unlocked by self-supervised pretraining strategies based on masked autoencoders which rely on reconstructing masked inputs, directly, or contrastively from unmasked content. 
This pretraining strategy which has been used in BERT models in NLP \\cite{bert}, Wav2Vec models in Speech \\cite{wv2v2} and, recently, in MAE models in Vision \\cite{beit, mae}, forces the model to learn about relationships between the content in different parts of the input using autoencoding related objectives. In this paper, we propose a novel, but surprisingly simple alternative to content reconstruction\u00a0\u2013 that of predicting locations from content, without providing positional information for it. Doing so requires the Transformer to understand the positional relationships between different parts of the input, from their content alone. This amounts to an efficient implementation where the pretext task is a classification problem among all possible positions for each input token. We experiment on both Vision and Speech benchmarks, where our approach brings improvements over strong supervised training baselines and is comparable to modern unsupervised/self-supervised pretraining methods. Our method also enables Transformers trained without position embeddings to outperform ones trained with full position information.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhai22a/zhai22a.pdf", "supp": "", "pdf_size": 19078878, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3214416996859791607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/zhai22a.html" }, { "title": "Power-Law Escape Rate of SGD", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16519", "id": "16519", "proceeding": "https://proceedings.mlr.press/v162/mori22a.html", "poster": "/media/PosterPDFs/ICML%202022/f52854cc99ae1c1966b0a21d0127975b.png?t=1657550468.6584225", "slides": "/media/icml-2022/Slides/16519.pdf", "author_site": "Takashi Mori, Liu Ziyin, Kangqiao Liu, Masahito Ueda", "author": "Takashi Mori; Liu Ziyin; Kangqiao Liu; Masahito Ueda", "abstract": "Stochastic gradient descent (SGD) undergoes complicated multiplicative noise for the mean-square loss. We use this property of SGD noise to derive a stochastic differential equation (SDE) with simpler additive noise by performing a random time change. Using this formalism, we show that the log loss barrier $\\Delta\\log L=\\log[L(\\theta^s)/L(\\theta^*)]$ between a local minimum $\\theta^*$ and a saddle $\\theta^s$ determines the escape rate of SGD from the local minimum, contrary to the previous results borrowing from physics that the linear loss barrier $\\Delta L=L(\\theta^s)-L(\\theta^*)$ decides the escape rate. Our escape-rate formula strongly depends on the typical magnitude $h^*$ and the number $n$ of the outlier eigenvalues of the Hessian. 
This result explains an empirical fact that SGD prefers flat minima with low effective dimensions, giving an insight into implicit biases of SGD.", "bibtex": "@InProceedings{pmlr-v162-mori22a,\n title = \t {Power-Law Escape Rate of {SGD}},\n author = {Mori, Takashi and Ziyin, Liu and Liu, Kangqiao and Ueda, Masahito},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15959--15975},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mori22a/mori22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mori22a.html},\n abstract = \t {Stochastic gradient descent (SGD) undergoes complicated multiplicative noise for the mean-square loss. We use this property of SGD noise to derive a stochastic differential equation (SDE) with simpler additive noise by performing a random time change. Using this formalism, we show that the log loss barrier $\\Delta\\log L=\\log[L(\\theta^s)/L(\\theta^*)]$ between a local minimum $\\theta^*$ and a saddle $\\theta^s$ determines the escape rate of SGD from the local minimum, contrary to the previous results borrowing from physics that the linear loss barrier $\\Delta L=L(\\theta^s)-L(\\theta^*)$ decides the escape rate. Our escape-rate formula strongly depends on the typical magnitude $h^*$ and the number $n$ of the outlier eigenvalues of the Hessian. This result explains an empirical fact that SGD prefers flat minima with low effective dimensions, giving an insight into implicit biases of SGD.}\n}", "pdf": "https://proceedings.mlr.press/v162/mori22a/mori22a.pdf", "supp": "", "pdf_size": 1385377, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5923050307070035195&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Center for Emergent Matter Science, Riken, Saitama, Japan + Department of Physics, The University of Tokyo, Tokyo, Japan + Institute for Physics of Intelligence, The University of Tokyo, Tokyo, Japan; Department of Physics, The University of Tokyo, Tokyo, Japan + Institute for Physics of Intelligence, The University of Tokyo, Tokyo, Japan; Department of Physics, The University of Tokyo, Tokyo, Japan + Institute for Physics of Intelligence, The University of Tokyo, Tokyo, Japan; Center for Emergent Matter Science, Riken, Saitama, Japan + Department of Physics, The University of Tokyo, Tokyo, Japan + Institute for Physics of Intelligence, The University of Tokyo, Tokyo, Japan", "aff_domain": "riken.jp; ; ;", "email": "riken.jp; ; ;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mori22a.html", "aff_unique_index": "0+1+1;1+1;1+1;0+1+1", "aff_unique_norm": "RIKEN;University of Tokyo", "aff_unique_dep": "Center for Emergent Matter Science;Department of Physics", "aff_unique_url": "https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Riken;UTokyo", "aff_campus_unique_index": "0+1+1;1+1;1+1;0+1+1", "aff_campus_unique": "Saitama;Tokyo", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0+0", "aff_country_unique": "Japan" }, { "title": "Practical Almost-Linear-Time Approximation Algorithms for Hybrid and Overlapping Graph Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16879", "id": "16879", 
"proceeding": "https://proceedings.mlr.press/v162/orecchia22a.html", "poster": "/media/PosterPDFs/ICML%202022/02ed812220b0705fabb868ddbf17ea20.png?t=1657400525.0224087", "slides": "", "author_site": "Lorenzo Orecchia, Konstantinos Ameranis, Charalampos Tsourakakis, Kunal Talwar", "author": "Lorenzo Orecchia; Konstantinos Ameranis; Charalampos Tsourakakis; Kunal Talwar", "abstract": "Detecting communities in real-world networks and clustering similarity graphs are major data mining tasks with a wide range of applications in graph mining, collaborative filtering, and bioinformatics. In many such applications, overwhelming empirical evidence suggests that communities and clusters are naturally overlapping, i.e., the boundary of a cluster may contain both edges across clusters and nodes that are shared with other clusters, calling for novel hybrid graph partitioning algorithms (HGP). While almost-linear-time approximation algorithms are known for edge-boundary-based graph partitioning, little progress has been made on fast algorithms for HGP, even in the special case of vertex-boundary-based graph partitioning. In this work, we introduce a frame-work based on two novel clustering objectives, which naturally extend the well-studied notion of conductance to clusters with hybrid vertex-and edge-boundary structure. Our main algorithmic contributions are almost-linear-time algorithms O(log n)-approximation algorithms for both these objectives. To this end, we show that the cut-matching framework of (Khandekar et al., 2014) can be significantly extended to incorporate hybrid partitions. Crucially, we implement our approximation algorithm to produce both hybrid partitions and optimality certificates for large graphs, easily scaling to tens of millions of edges, and test our implementation on real-world datasets against other competitive baselines.", "bibtex": "@InProceedings{pmlr-v162-orecchia22a,\n title = \t {Practical Almost-Linear-Time Approximation Algorithms for Hybrid and Overlapping Graph Clustering},\n author = {Orecchia, Lorenzo and Ameranis, Konstantinos and Tsourakakis, Charalampos and Talwar, Kunal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17071--17093},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/orecchia22a/orecchia22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/orecchia22a.html},\n abstract = \t {Detecting communities in real-world networks and clustering similarity graphs are major data mining tasks with a wide range of applications in graph mining, collaborative filtering, and bioinformatics. In many such applications, overwhelming empirical evidence suggests that communities and clusters are naturally overlapping, i.e., the boundary of a cluster may contain both edges across clusters and nodes that are shared with other clusters, calling for novel hybrid graph partitioning algorithms (HGP). While almost-linear-time approximation algorithms are known for edge-boundary-based graph partitioning, little progress has been made on fast algorithms for HGP, even in the special case of vertex-boundary-based graph partitioning. 
In this work, we introduce a frame-work based on two novel clustering objectives, which naturally extend the well-studied notion of conductance to clusters with hybrid vertex-and edge-boundary structure. Our main algorithmic contributions are almost-linear-time algorithms O(log n)-approximation algorithms for both these objectives. To this end, we show that the cut-matching framework of (Khandekar et al., 2014) can be significantly extended to incorporate hybrid partitions. Crucially, we implement our approximation algorithm to produce both hybrid partitions and optimality certificates for large graphs, easily scaling to tens of millions of edges, and test our implementation on real-world datasets against other competitive baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/orecchia22a/orecchia22a.pdf", "supp": "", "pdf_size": 2490977, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15803407805561735985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, University of Chicago, Chicago, USA; Department of Computer Science, University of Chicago, Chicago, USA; Apple Inc; Department of Computer Science, Boston University, Boston, USA", "aff_domain": "uchicago.edu;uchicago.edu; ; ", "email": "uchicago.edu;uchicago.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/orecchia22a.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Chicago;Apple;Boston University", "aff_unique_dep": "Department of Computer Science;Apple Inc;Department of Computer Science", "aff_unique_url": "https://www.uchicago.edu;https://www.apple.com;https://www.bu.edu", "aff_unique_abbr": "UChicago;Apple;BU", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Chicago;;Boston", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Preconditioning for Scalable Gaussian Process Hyperparameter Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16533", "id": "16533", "proceeding": "https://proceedings.mlr.press/v162/wenger22a.html", "poster": "/media/PosterPDFs/ICML%202022/c5ff2543b53f4cc0ad3819a36752467b.png?t=1657768153.907864", "slides": "", "author_site": "Jonathan Wenger, Geoff Pleiss, Philipp Hennig, John Cunningham, Jacob Gardner", "author": "Jonathan Wenger; Geoff Pleiss; Philipp Hennig; John Cunningham; Jacob Gardner", "abstract": "Gaussian process hyperparameter optimization requires linear solves with, and log-determinants of, large kernel matrices. Iterative numerical techniques are becoming popular to scale to larger datasets, relying on the conjugate gradient method (CG) for the linear solves and stochastic trace estimation for the log-determinant. This work introduces new algorithmic and theoretical insights for preconditioning these computations. While preconditioning is well understood in the context of CG, we demonstrate that it can also accelerate convergence and reduce variance of the estimates for the log-determinant and its derivative. We prove general probabilistic error bounds for the preconditioned computation of the log-determinant, log-marginal likelihood and its derivatives. Additionally, we derive specific rates for a range of kernel-preconditioner combinations, showing that up to exponential convergence can be achieved. 
Our theoretical results enable provably efficient optimization of kernel hyperparameters, which we validate empirically on large-scale benchmark problems. There our approach accelerates training by up to an order of magnitude.", "bibtex": "@InProceedings{pmlr-v162-wenger22a,\n title = \t {Preconditioning for Scalable {G}aussian Process Hyperparameter Optimization},\n author = {Wenger, Jonathan and Pleiss, Geoff and Hennig, Philipp and Cunningham, John and Gardner, Jacob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23751--23780},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wenger22a/wenger22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wenger22a.html},\n abstract = \t {Gaussian process hyperparameter optimization requires linear solves with, and log-determinants of, large kernel matrices. Iterative numerical techniques are becoming popular to scale to larger datasets, relying on the conjugate gradient method (CG) for the linear solves and stochastic trace estimation for the log-determinant. This work introduces new algorithmic and theoretical insights for preconditioning these computations. While preconditioning is well understood in the context of CG, we demonstrate that it can also accelerate convergence and reduce variance of the estimates for the log-determinant and its derivative. We prove general probabilistic error bounds for the preconditioned computation of the log-determinant, log-marginal likelihood and its derivatives. Additionally, we derive specific rates for a range of kernel-preconditioner combinations, showing that up to exponential convergence can be achieved. Our theoretical results enable provably efficient optimization of kernel hyperparameters, which we validate empirically on large-scale benchmark problems. 
There our approach accelerates training by up to an order of magnitude.}\n}", "pdf": "https://proceedings.mlr.press/v162/wenger22a/wenger22a.pdf", "supp": "", "pdf_size": 874549, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5186078999391595053&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "University of T\u00fcbingen+Max Planck Institute for Intelligent Systems, T\u00fcbingen; Columbia University; University of T\u00fcbingen+Max Planck Institute for Intelligent Systems, T\u00fcbingen; Columbia University; University of Pennsylvania", "aff_domain": "uni-tuebingen.de; ; ; ; ", "email": "uni-tuebingen.de; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wenger22a.html", "aff_unique_index": "0+1;2;0+1;2;3", "aff_unique_norm": "University of T\u00fcbingen;Max Planck Institute for Intelligent Systems;Columbia University;University of Pennsylvania", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de;https://www.columbia.edu;https://www.upenn.edu", "aff_unique_abbr": "Uni T\u00fcbingen;MPI-IS;Columbia;UPenn", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";T\u00fcbingen", "aff_country_unique_index": "0+0;1;0+0;1;1", "aff_country_unique": "Germany;United States" }, { "title": "Predicting Out-of-Distribution Error with the Projection Norm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17655", "id": "17655", "proceeding": "https://proceedings.mlr.press/v162/yu22i.html", "poster": "/media/PosterPDFs/ICML%202022/39461a19e9eddfb385ea76b26521ea48.png?t=1657739740.6020148", "slides": "", "author_site": "Yaodong Yu, Zitong Yang, Alexander Wei, Yi Ma, Jacob Steinhardt", "author": "Yaodong Yu; Zitong Yang; Alexander Wei; Yi Ma; Jacob Steinhardt", "abstract": "We propose a metric\u2014", "bibtex": "@InProceedings{pmlr-v162-yu22i,\n title = \t {Predicting Out-of-Distribution Error with the Projection Norm},\n author = {Yu, Yaodong and Yang, Zitong and Wei, Alexander and Ma, Yi and Steinhardt, Jacob},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25721--25746},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22i/yu22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22i.html},\n abstract = \t {We propose a metric\u2014", "pdf": "https://proceedings.mlr.press/v162/yu22i/yu22i.pdf", "supp": "", "pdf_size": 3293779, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14580458746203726066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley; University of California, Berkeley", "aff_domain": "eecs.berkeley.edu;berkeley.edu; ; ; ", "email": "eecs.berkeley.edu;berkeley.edu; ; ; ", "github": "https://github.com/yaodongyu/ProjNorm", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/yu22i.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", 
"aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Principal Component Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16931", "id": "16931", "proceeding": "https://proceedings.mlr.press/v162/cunningham22a.html", "poster": "", "slides": "", "author_site": "Edmond Cunningham, Adam Cobb, Susmit Jha", "author": "Edmond Cunningham; Adam D Cobb; Susmit Jha", "abstract": "Normalizing flows map an independent set of latent variables to their samples using a bijective transformation. Despite the exact correspondence between samples and latent variables, their high level relationship is not well understood. In this paper we characterize the geometric structure of flows using principal manifolds and understand the relationship between latent variables and samples using contours. We introduce a novel class of normalizing flows, called principal component flows (PCF), whose contours are its principal manifolds, and a variant for injective flows (iPCF) that is more efficient to train than regular injective flows. PCFs can be constructed using any flow architecture, are trained with a regularized maximum likelihood objective and can perform density estimation on all of their principal manifolds. In our experiments we show that PCFs and iPCFs are able to learn the principal manifolds over a variety of datasets. Additionally, we show that PCFs can perform density estimation on data that lie on a manifold with variable dimensionality, which is not possible with existing normalizing flows.", "bibtex": "@InProceedings{pmlr-v162-cunningham22a,\n title = \t {Principal Component Flows},\n author = {Cunningham, Edmond and Cobb, Adam D and Jha, Susmit},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4492--4519},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cunningham22a/cunningham22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cunningham22a.html},\n abstract = \t {Normalizing flows map an independent set of latent variables to their samples using a bijective transformation. Despite the exact correspondence between samples and latent variables, their high level relationship is not well understood. In this paper we characterize the geometric structure of flows using principal manifolds and understand the relationship between latent variables and samples using contours. We introduce a novel class of normalizing flows, called principal component flows (PCF), whose contours are its principal manifolds, and a variant for injective flows (iPCF) that is more efficient to train than regular injective flows. PCFs can be constructed using any flow architecture, are trained with a regularized maximum likelihood objective and can perform density estimation on all of their principal manifolds. In our experiments we show that PCFs and iPCFs are able to learn the principal manifolds over a variety of datasets. 
Additionally, we show that PCFs can perform density estimation on data that lie on a manifold with variable dimensionality, which is not possible with existing normalizing flows.}\n}", "pdf": "https://proceedings.mlr.press/v162/cunningham22a/cunningham22a.pdf", "supp": "", "pdf_size": 2239660, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11692571994890153706&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Massachusetts + SRI International; SRI International; SRI International", "aff_domain": "cs.umass.edu; ; ", "email": "cs.umass.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cunningham22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "University of Massachusetts;SRI International", "aff_unique_dep": ";", "aff_unique_url": "https://www.umass.edu;https://www.sri.com", "aff_unique_abbr": "UMass;SRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Principled Knowledge Extrapolation with GANs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17229", "id": "17229", "proceeding": "https://proceedings.mlr.press/v162/feng22b.html", "poster": "/media/PosterPDFs/ICML%202022/3cec07e9ba5f5bb252d13f5f431e4bbb.png?t=1658125267.462012", "slides": "", "author_site": "Ruili Feng, Jie Xiao, Kecheng Zheng, Deli Zhao, Jingren Zhou, Qibin Sun, Zheng-Jun Zha", "author": "Ruili Feng; Jie Xiao; Kecheng Zheng; Deli Zhao; Jingren Zhou; Qibin Sun; Zheng-Jun Zha", "abstract": "Human can extrapolate well, generalize daily knowledge into unseen scenarios, raise and answer counterfactual questions. To imitate this ability via generative models, previous works have extensively studied explicitly encoding Structural Causal Models (SCMs) into architectures of generator networks. This methodology, however, limits the flexibility of the generator as they must be carefully crafted to follow the causal graph, and demands a ground truth SCM with strong ignorability assumption as prior, which is a nontrivial assumption in many real scenarios. Thus, many current causal GAN methods fail to generate high fidelity counterfactual results as they cannot easily leverage state-of-the-art generative models. In this paper, we propose to study counterfactual synthesis from a new perspective of knowledge extrapolation, where a given knowledge dimension of the data distribution is extrapolated, but the remaining knowledge is kept indistinguishable from the original distribution. We show that an adversarial game with a closed-form discriminator can be used to address the knowledge extrapolation problem, and a novel principal knowledge descent method can efficiently estimate the extrapolated distribution through the adversarial game. 
Our method enjoys both elegant theoretical guarantees and superior performance in many scenarios.", "bibtex": "@InProceedings{pmlr-v162-feng22b,\n title = \t {Principled Knowledge Extrapolation with {GAN}s},\n author = {Feng, Ruili and Xiao, Jie and Zheng, Kecheng and Zhao, Deli and Zhou, Jingren and Sun, Qibin and Zha, Zheng-Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6447--6464},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/feng22b/feng22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/feng22b.html},\n abstract = \t {Human can extrapolate well, generalize daily knowledge into unseen scenarios, raise and answer counterfactual questions. To imitate this ability via generative models, previous works have extensively studied explicitly encoding Structural Causal Models (SCMs) into architectures of generator networks. This methodology, however, limits the flexibility of the generator as they must be carefully crafted to follow the causal graph, and demands a ground truth SCM with strong ignorability assumption as prior, which is a nontrivial assumption in many real scenarios. Thus, many current causal GAN methods fail to generate high fidelity counterfactual results as they cannot easily leverage state-of-the-art generative models. In this paper, we propose to study counterfactual synthesis from a new perspective of knowledge extrapolation, where a given knowledge dimension of the data distribution is extrapolated, but the remaining knowledge is kept indistinguishable from the original distribution. We show that an adversarial game with a closed-form discriminator can be used to address the knowledge extrapolation problem, and a novel principal knowledge descent method can efficiently estimate the extrapolated distribution through the adversarial game. 
Our method enjoys both elegant theoretical guarantees and superior performance in many scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/feng22b/feng22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/feng22b-supp.zip", "pdf_size": 42886247, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16977525841987501691&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Science and Technology of China; University of Science and Technology of China; University of Science and Technology of China; Ant Research; Alibaba Group; University of Science and Technology of China; University of Science and Technology of China", "aff_domain": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;zju.edu.cn;alibaba-inc.com;ustc.edu.cn;ustc.edu.cn", "email": "ustc.edu.cn;ustc.edu.cn;ustc.edu.cn;zju.edu.cn;alibaba-inc.com;ustc.edu.cn;ustc.edu.cn", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/feng22b.html", "aff_unique_index": "0;0;0;1;2;0;0", "aff_unique_norm": "University of Science and Technology of China;Ant Research;Alibaba Group", "aff_unique_dep": ";;", "aff_unique_url": "http://www.ustc.edu.cn;https://www.antgroup.com;https://www.alibaba.com", "aff_unique_abbr": "USTC;Ant Research;Alibaba", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Prioritized Training on Points that are Learnable, Worth Learning, and not yet Learnt", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18419", "id": "18419", "proceeding": "https://proceedings.mlr.press/v162/mindermann22a.html", "poster": "/media/PosterPDFs/ICML%202022/9823c9091899dc6f227f9133ba2dfd30_XWG6iPL.png?t=1658090620.4648583", "slides": "/media/icml-2022/Slides/18419.pdf", "author_site": "S\u00f6ren Mindermann, Jan Brauner, Muhammed Razzak, Mrinank Sharma, Andreas Kirsch, Winnie Xu, Benedikt H\u00f6ltgen, Aidan Gomez, Adrien Morisot, Sebastian Farquhar, Yarin Gal", "author": "S\u00f6ren Mindermann; Jan M Brauner; Muhammed T Razzak; Mrinank Sharma; Andreas Kirsch; Winnie Xu; Benedikt H\u00f6ltgen; Aidan N Gomez; Adrien Morisot; Sebastian Farquhar; Yarin Gal", "abstract": "Training on web-scale data can take months. But much computation and time is wasted on redundant and noisy points that are already learnt or not learnable. To accelerate training, we introduce Reducible Holdout Loss Selection (RHO-LOSS), a simple but principled technique which selects approximately those points for training that most reduce the model\u2019s generalization loss. As a result, RHO-LOSS mitigates the weaknesses of existing data selection methods: techniques from the optimization literature typically select \"hard\" (e.g. high loss) points, but such points are often noisy (not learnable) or less task-relevant. Conversely, curriculum learning prioritizes \"easy\" points, but such points need not be trained on once learned. In contrast, RHO-LOSS selects points that are learnable, worth learning, and not yet learnt. RHO-LOSS trains in far fewer steps than prior art, improves accuracy, and speeds up training on a wide range of datasets, hyperparameters, and architectures (MLPs, CNNs, and BERT). 
On the large web-scraped image dataset Clothing-1M, RHO-LOSS trains in 18x fewer steps and reaches 2% higher final accuracy than uniform data shuffling.", "bibtex": "@InProceedings{pmlr-v162-mindermann22a,\n title = \t {Prioritized Training on Points that are Learnable, Worth Learning, and not yet Learnt},\n author = {Mindermann, S{\\\"o}ren and Brauner, Jan M and Razzak, Muhammed T and Sharma, Mrinank and Kirsch, Andreas and Xu, Winnie and H{\\\"o}ltgen, Benedikt and Gomez, Aidan N and Morisot, Adrien and Farquhar, Sebastian and Gal, Yarin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15630--15649},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mindermann22a/mindermann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mindermann22a.html},\n abstract = \t {Training on web-scale data can take months. But much computation and time is wasted on redundant and noisy points that are already learnt or not learnable. To accelerate training, we introduce Reducible Holdout Loss Selection (RHO-LOSS), a simple but principled technique which selects approximately those points for training that most reduce the model\u2019s generalization loss. As a result, RHO-LOSS mitigates the weaknesses of existing data selection methods: techniques from the optimization literature typically select \"hard\" (e.g. high loss) points, but such points are often noisy (not learnable) or less task-relevant. Conversely, curriculum learning prioritizes \"easy\" points, but such points need not be trained on once learned. In contrast, RHO-LOSS selects points that are learnable, worth learning, and not yet learnt. RHO-LOSS trains in far fewer steps than prior art, improves accuracy, and speeds up training on a wide range of datasets, hyperparameters, and architectures (MLPs, CNNs, and BERT). 
On the large web-scraped image dataset Clothing-1M, RHO-LOSS trains in 18x fewer steps and reaches 2% higher final accuracy than uniform data shuffling.}\n}", "pdf": "https://proceedings.mlr.press/v162/mindermann22a/mindermann22a.pdf", "supp": "", "pdf_size": 1428379, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5784378723216835078&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "OATML, Department of Computer Science, University of Oxford; OATML, Department of Computer Science, University of Oxford; OATML, Department of Computer Science, University of Oxford; Department of Statistics, University of Oxford; OATML, Department of Computer Science, University of Oxford; Cohere.ai+University of Toronto, performed at Cohere.ai; OATML, Department of Computer Science, University of Oxford; Cohere.ai+OATML, Department of Computer Science, University of Oxford; Cohere.ai; OATML, Department of Computer Science, University of Oxford; OATML, Department of Computer Science, University of Oxford", "aff_domain": "gmail.com; ; ; ; ; ; ; ; ; ;", "email": "gmail.com; ; ; ; ; ; ; ; ; ;", "github": "https://github.com/OATML/RHO-Loss", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/mindermann22a.html", "aff_unique_index": "0;0;0;0;0;1+2;0;1+0;1;0;0", "aff_unique_norm": "University of Oxford;Cohere;University of Toronto", "aff_unique_dep": "Department of Computer Science;;", "aff_unique_url": "https://www.ox.ac.uk;https://cohere.ai;https://www.utoronto.ca", "aff_unique_abbr": "Oxford;Cohere;U of T", "aff_campus_unique_index": "0;0;0;0;0;;0;0;0;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0;0;0;1+2;0;1+0;1;0;0", "aff_country_unique": "United Kingdom;United States;Canada" }, { "title": "Privacy for Free: How does Dataset Condensation Help Privacy?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18235", "id": "18235", "proceeding": "https://proceedings.mlr.press/v162/dong22c.html", "poster": "/media/PosterPDFs/ICML%202022/a3c788c57e423fa9c177544a4d5d1239.png?t=1657427240.4621756", "slides": "", "author_site": "Tian Dong, Bo Zhao, Lingjuan Lyu", "author": "Tian Dong; Bo Zhao; Lingjuan Lyu", "abstract": "To prevent unintentional data leakage, research community has resorted to data generators that can produce differentially private data for model training. However, for the sake of the data privacy, existing solutions suffer from either expensive training cost or poor generalization performance. Therefore, we raise the question whether training efficiency and privacy can be achieved simultaneously. In this work, we for the first time identify that dataset condensation (DC) which is originally designed for improving training efficiency is also a better solution to replace the traditional data generators for private data generation, thus providing privacy for free. To demonstrate the privacy benefit of DC, we build a connection between DC and differential privacy, and theoretically prove on linear feature extractors (and then extended to non-linear feature extractors) that the existence of one sample has limited impact ($O(m/n)$) on the parameter distribution of networks trained on $m$ samples synthesized from $n (n \\gg m)$ raw samples by DC. We also empirically validate the visual privacy and membership privacy of DC-synthesized data by launching both the loss-based and the state-of-the-art likelihood-based membership inference attacks. 
We envision this work as a milestone for data-efficient and privacy-preserving machine learning.", "bibtex": "@InProceedings{pmlr-v162-dong22c,\n title = \t {Privacy for Free: How does Dataset Condensation Help Privacy?},\n author = {Dong, Tian and Zhao, Bo and Lyu, Lingjuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5378--5396},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dong22c/dong22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/dong22c.html},\n abstract = \t {To prevent unintentional data leakage, research community has resorted to data generators that can produce differentially private data for model training. However, for the sake of the data privacy, existing solutions suffer from either expensive training cost or poor generalization performance. Therefore, we raise the question whether training efficiency and privacy can be achieved simultaneously. In this work, we for the first time identify that dataset condensation (DC) which is originally designed for improving training efficiency is also a better solution to replace the traditional data generators for private data generation, thus providing privacy for free. To demonstrate the privacy benefit of DC, we build a connection between DC and differential privacy, and theoretically prove on linear feature extractors (and then extended to non-linear feature extractors) that the existence of one sample has limited impact ($O(m/n)$) on the parameter distribution of networks trained on $m$ samples synthesized from $n (n \\gg m)$ raw samples by DC. We also empirically validate the visual privacy and membership privacy of DC-synthesized data by launching both the loss-based and the state-of-the-art likelihood-based membership inference attacks. 
We envision this work as a milestone for data-efficient and privacy-preserving machine learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/dong22c/dong22c.pdf", "supp": "", "pdf_size": 2272442, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4558058824936525095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, Shanghai Jiao Tong University; School of Informatics, The University of Edinburgh; Sony AI", "aff_domain": "sony.com; ; ", "email": "sony.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dong22c.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Shanghai Jiao Tong University;University of Edinburgh;Sony", "aff_unique_dep": "Department of Computer Science and Engineering;School of Informatics;Sony AI", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ed.ac.uk;https://www.sony.com", "aff_unique_abbr": "SJTU;Edinburgh;Sony AI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Edinburgh", "aff_country_unique_index": "0;1;2", "aff_country_unique": "China;United Kingdom;Japan" }, { "title": "Private Adaptive Optimization with Side information", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17541", "id": "17541", "proceeding": "https://proceedings.mlr.press/v162/li22x.html", "poster": "/media/PosterPDFs/ICML%202022/2be5f9c2e3620eb73c2972d7552b6cb5.png?t=1657598252.2216218", "slides": "/media/icml-2022/Slides/17541.pdf", "author_site": "Tian Li, Manzil Zaheer, Sashank Jakkam Reddi, Virginia Smith", "author": "Tian Li; Manzil Zaheer; Sashank Reddi; Virginia Smith", "abstract": "Adaptive optimization methods have become the default solvers for many machine learning tasks. Unfortunately, the benefits of adaptivity may degrade when training with differential privacy, as the noise added to ensure privacy reduces the effectiveness of the adaptive preconditioner. To this end, we propose AdaDPS, a general framework that uses non-sensitive side information to precondition the gradients, allowing the effective use of adaptive methods in private settings. We formally show AdaDPS reduces the amount of noise needed to achieve similar privacy guarantees, thereby improving optimization performance. Empirically, we leverage simple and readily available side information to explore the performance of AdaDPS in practice, comparing to strong baselines in both centralized and federated settings. Our results show that AdaDPS improves accuracy by 7.7% (absolute) on average\u2014yielding state-of-the-art privacy-utility trade-offs on large-scale text and image benchmarks.", "bibtex": "@InProceedings{pmlr-v162-li22x,\n title = \t {Private Adaptive Optimization with Side information},\n author = {Li, Tian and Zaheer, Manzil and Reddi, Sashank and Smith, Virginia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13086--13105},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22x/li22x.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22x.html},\n abstract = \t {Adaptive optimization methods have become the default solvers for many machine learning tasks. 
Unfortunately, the benefits of adaptivity may degrade when training with differential privacy, as the noise added to ensure privacy reduces the effectiveness of the adaptive preconditioner. To this end, we propose AdaDPS, a general framework that uses non-sensitive side information to precondition the gradients, allowing the effective use of adaptive methods in private settings. We formally show AdaDPS reduces the amount of noise needed to achieve similar privacy guarantees, thereby improving optimization performance. Empirically, we leverage simple and readily available side information to explore the performance of AdaDPS in practice, comparing to strong baselines in both centralized and federated settings. Our results show that AdaDPS improves accuracy by 7.7% (absolute) on average\u2014yielding state-of-the-art privacy-utility trade-offs on large-scale text and image benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22x/li22x.pdf", "supp": "", "pdf_size": 615356, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15603924695620252408&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Carnegie Mellon University; Google DeepMind; Google Research; Carnegie Mellon University", "aff_domain": "cmu.edu; ; ; ", "email": "cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/li22x.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.cmu.edu;https://deepmind.com", "aff_unique_abbr": "CMU;DeepMind", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Private Streaming SCO in $\\ell_p$ geometry with Applications in High Dimensional Online Decision Making", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18263", "id": "18263", "proceeding": "https://proceedings.mlr.press/v162/han22d.html", "poster": "/media/PosterPDFs/ICML%202022/c45008212f7bdf6eab6050c2a564435a.png?t=1656679689.6720874", "slides": "", "author_site": "Yuxuan Han, Zhicong Liang, Zhipeng Liang, Yang Wang, Yuan Yao, Jiheng Zhang", "author": "Yuxuan Han; Zhicong Liang; Zhipeng Liang; Yang Wang; Yuan Yao; Jiheng Zhang", "abstract": "Differentially private (DP) stochastic convex optimization (SCO) is ubiquitous in trustworthy machine learning algorithm design. This paper studies the DP-SCO problem with streaming data sampled from a distribution and arrives sequentially. We also consider the continual release model where parameters related to private information are updated and released upon each new data. Numerous algorithms have been developed to achieve optimal excess risks in different $\\ell_p$ norm geometries, but none of the existing ones can be adapted to the streaming and continual release setting. We propose a private variant of the Frank-Wolfe algorithm with recursive gradients for variance reduction to update and reveal the parameters upon each data. 
Combined with the adaptive DP analysis, our algorithm achieves the first optimal excess risk in linear time in the case $1", "bibtex": "@InProceedings{pmlr-v162-han22d,\n title = \t {Private Streaming {SCO} in $\\ell_p$ geometry with Applications in High Dimensional Online Decision Making},\n author = {Han, Yuxuan and Liang, Zhicong and Liang, Zhipeng and Wang, Yang and Yao, Yuan and Zhang, Jiheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8249--8279},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22d/han22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22d.html},\n abstract = \t {Differentially private (DP) stochastic convex optimization (SCO) is ubiquitous in trustworthy machine learning algorithm design. This paper studies the DP-SCO problem with streaming data sampled from a distribution and arrives sequentially. We also consider the continual release model where parameters related to private information are updated and released upon each new data. Numerous algorithms have been developed to achieve optimal excess risks in different $\\ell_p$ norm geometries, but none of the existing ones can be adapted to the streaming and continual release setting. We propose a private variant of the Frank-Wolfe algorithm with recursive gradients for variance reduction to update and reveal the parameters upon each data. Combined with the adaptive DP analysis, our algorithm achieves the first optimal excess risk in linear time in the case $1", "pdf": "https://proceedings.mlr.press/v162/han22d/han22d.pdf", "supp": "", "pdf_size": 509112, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7518092985984671259&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Mathematics; Department of Mathematics; Department of Mathematics + Department of Industrial Engineering and Decision Analytics; Department of Mathematics + Department of Industrial Engineering and Decision Analytics; Department of Mathematics; Department of Mathematics + Department of Industrial Engineering and Decision Analytics", "aff_domain": "ust.hk;ust.hk;ust.hk;ust.hk;ust.hk;ust.hk", "email": "ust.hk;ust.hk;ust.hk;ust.hk;ust.hk;ust.hk", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/han22d.html", "aff_unique_index": "0;0;0+1;0+1;0;0+1", "aff_unique_norm": "Mathematics Department;Department of Industrial Engineering and Decision Analytics", "aff_unique_dep": "Department of Mathematics;Industrial Engineering and Decision Analytics", "aff_unique_url": ";", "aff_unique_abbr": ";", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": ";;", "aff_country_unique": "" }, { "title": "Private frequency estimation via projective geometry", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17329", "id": "17329", "proceeding": "https://proceedings.mlr.press/v162/feldman22a.html", "poster": "/media/PosterPDFs/ICML%202022/597c7b407a02cc0a92167e7a371eca25.png?t=1657652661.1814086", "slides": "", "author_site": "Vitaly Feldman, Jelani Nelson, Huy Nguyen, Kunal Talwar", "author": "Vitaly Feldman; Jelani Nelson; Huy Nguyen; Kunal Talwar", "abstract": "In 
this work, we propose a new algorithm ProjectiveGeometryResponse (PGR) for locally differentially private (LDP) frequency estimation. For a universe size of k and with n users, our eps-LDP algorithm has communication cost ceil(log_2 k) and computation cost O(n + k\\exp(eps) log k) for the server to approximately reconstruct the frequency histogram, while achieving an optimal privacy-utility tradeoff. In many practical settings this is a significant improvement over the O(n+k^2) computation cost that is achieved by the recent PI-RAPPOR algorithm (Feldman and Talwar; 2021). Our empirical evaluation shows a speedup of over 50x over PI-RAPPOR while using approximately 75x less memory. In addition, the running time of our algorithm is comparable to that of HadamardResponse (Acharya, Sun, and Zhang; 2019) and RecursiveHadamardResponse (Chen, Kairouz, and Ozgur; 2020) which have significantly worse reconstruction error. The error of our algorithm essentially matches that of the communication- and time-inefficient but utility-optimal SubsetSelection (SS) algorithm (Ye and Barg; 2017). Our new algorithm is based on using Projective Planes over a finite field to define a small collection of sets that are close to being pairwise independent and a dynamic programming algorithm for approximate histogram reconstruction for the server.", "bibtex": "@InProceedings{pmlr-v162-feldman22a,\n title = \t {Private frequency estimation via projective geometry},\n author = {Feldman, Vitaly and Nelson, Jelani and Nguyen, Huy and Talwar, Kunal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6418--6433},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/feldman22a/feldman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/feldman22a.html},\n abstract = \t {In this work, we propose a new algorithm ProjectiveGeometryResponse (PGR) for locally differentially private (LDP) frequency estimation. For a universe size of k and with n users, our eps-LDP algorithm has communication cost ceil(log_2 k) and computation cost O(n + k\\exp(eps) log k) for the server to approximately reconstruct the frequency histogram, while achieving an optimal privacy-utility tradeoff. In many practical settings this is a significant improvement over the O(n+k^2) computation cost that is achieved by the recent PI-RAPPOR algorithm (Feldman and Talwar; 2021). Our empirical evaluation shows a speedup of over 50x over PI-RAPPOR while using approximately 75x less memory. In addition, the running time of our algorithm is comparable to that of HadamardResponse (Acharya, Sun, and Zhang; 2019) and RecursiveHadamardResponse (Chen, Kairouz, and Ozgur; 2020) which have significantly worse reconstruction error. The error of our algorithm essentially matches that of the communication- and time-inefficient but utility-optimal SubsetSelection (SS) algorithm (Ye and Barg; 2017). 
Our new algorithm is based on using Projective Planes over a finite field to define a small collection of sets that are close to being pairwise independent and a dynamic programming algorithm for approximate histogram reconstruction for the server.}\n}", "pdf": "https://proceedings.mlr.press/v162/feldman22a/feldman22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/feldman22a-supp.zip", "pdf_size": 851611, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5605547034926514625&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Apple, Cupertino, CA, USA+UC Berkeley, CA, USA; UC Berkeley, CA, USA; Northeastern University, MA, USA; Apple, Cupertino, CA, USA", "aff_domain": "gmail.com;berkeley.edu;northeastern.edu;kunaltalwar.org", "email": "gmail.com;berkeley.edu;northeastern.edu;kunaltalwar.org", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/feldman22a.html", "aff_unique_index": "0+1;1;2;0", "aff_unique_norm": "Apple;University of California, Berkeley;Northeastern University", "aff_unique_dep": "Apple Inc.;;", "aff_unique_url": "https://www.apple.com;https://www.berkeley.edu;https://www.northeastern.edu", "aff_unique_abbr": "Apple;UC Berkeley;NEU", "aff_campus_unique_index": "0+1;1;2;0", "aff_campus_unique": "Cupertino;Berkeley;MA", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Private optimization in the interpolation regime: faster rates and hardness results", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17981", "id": "17981", "proceeding": "https://proceedings.mlr.press/v162/asi22a.html", "poster": "", "slides": "", "author_site": "Hilal Asi, Karan Chadha, Gary Cheng, John Duchi", "author": "Hilal Asi; Karan Chadha; Gary Cheng; John Duchi", "abstract": "In non-private stochastic convex optimization, stochastic gradient methods converge much faster on interpolation problems\u2014namely, problems where there exists a solution that simultaneously minimizes all of the sample losses\u2014than on non-interpolating ones; similar improvements are not known in the private setting. In this paper, we investigate differentially private stochastic optimization in the interpolation regime. First, we show that without additional assumptions, interpolation problems do not exhibit an improved convergence rates with differential privacy. However, when the functions exhibit quadratic growth around the optimum, we show (near) exponential improvements in the private sample complexity. In particular, we propose an adaptive algorithm that improves the sample complexity to achieve expected error $\\alpha$ from $\\frac{d}{\\diffp \\sqrt{\\alpha}}$ to $\\frac{1}{\\alpha^\\rho} + \\frac{d}{\\diffp} \\log\\paren{\\frac{1}{\\alpha}}$ for any fixed $\\rho >0$, while retaining the standard minimax-optimal sample complexity for non-interpolation problems. We prove a lower bound that shows the dimension-dependent term in the expression above is tight. 
Furthermore, we provide a superefficiency result which demonstrates the necessity of the polynomial term for adaptive algorithms: any algorithm that has a polylogarithmic sample complexity for interpolation problems cannot achieve the minimax-optimal rates for the family of non-interpolation problems.", "bibtex": "@InProceedings{pmlr-v162-asi22a,\n title = \t {Private optimization in the interpolation regime: faster rates and hardness results},\n author = {Asi, Hilal and Chadha, Karan and Cheng, Gary and Duchi, John},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1025--1045},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/asi22a/asi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/asi22a.html},\n abstract = \t {In non-private stochastic convex optimization, stochastic gradient methods converge much faster on interpolation problems\u2014namely, problems where there exists a solution that simultaneously minimizes all of the sample losses\u2014than on non-interpolating ones; similar improvements are not known in the private setting. In this paper, we investigate differentially private stochastic optimization in the interpolation regime. First, we show that without additional assumptions, interpolation problems do not exhibit an improved convergence rates with differential privacy. However, when the functions exhibit quadratic growth around the optimum, we show (near) exponential improvements in the private sample complexity. In particular, we propose an adaptive algorithm that improves the sample complexity to achieve expected error $\\alpha$ from $\\frac{d}{\\diffp \\sqrt{\\alpha}}$ to $\\frac{1}{\\alpha^\\rho} + \\frac{d}{\\diffp} \\log\\paren{\\frac{1}{\\alpha}}$ for any fixed $\\rho >0$, while retaining the standard minimax-optimal sample complexity for non-interpolation problems. We prove a lower bound that shows the dimension-dependent term in the expression above is tight. 
Furthermore, we provide a superefficiency result which demonstrates the necessity of the polynomial term for adaptive algorithms: any algorithm that has a polylogarithmic sample complexity for interpolation problems cannot achieve the minimax-optimal rates for the family of non-interpolation problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/asi22a/asi22a.pdf", "supp": "", "pdf_size": 401222, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9926596144495133601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University + Department of Statistics, Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/asi22a.html", "aff_unique_index": "0;0;0;0+0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "ProGCL: Rethinking Hard Negative Mining in Graph Contrastive Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17577", "id": "17577", "proceeding": "https://proceedings.mlr.press/v162/xia22b.html", "poster": "/media/PosterPDFs/ICML%202022/c0c7c76d30bd3dcaefc96f40275bdc0a.png?t=1657698405.5629005", "slides": "", "author_site": "Jun Xia, Lirong Wu, Wang Ge, Jintao Chen, Stan Z. Li", "author": "Jun Xia; Lirong Wu; Ge Wang; Jintao Chen; Stan Z. Li", "abstract": "Contrastive Learning (CL) has emerged as a dominant technique for unsupervised representation learning which embeds augmented versions of the anchor close to each other (positive samples) and pushes the embeddings of other samples (negatives) apart. As revealed in recent studies, CL can benefit from hard negatives (negatives that are most similar to the anchor). However, we observe limited benefits when we adopt existing hard negative mining techniques of other domains in Graph Contrastive Learning (GCL). We perform both experimental and theoretical analysis on this phenomenon and find it can be attributed to the message passing of Graph Neural Networks (GNNs). Unlike CL in other domains, most hard negatives are potentially false negatives (negatives that share the same class with the anchor) if they are selected merely according to the similarities between anchor and themselves, which will undesirably push away the samples of the same class. To remedy this deficiency, we propose an effective method, dubbed \\textbf{ProGCL}, to estimate the probability of a negative being true one, which constitutes a more suitable measure for negatives\u2019 hardness together with similarity. Additionally, we devise two schemes (i.e., \\textbf{ProGCL-weight} and \\textbf{ProGCL-mix}) to boost the performance of GCL. Extensive experiments demonstrate that ProGCL brings notable and consistent improvements over base GCL methods and yields multiple state-of-the-art results on several unsupervised benchmarks or even exceeds the performance of supervised ones. 
Also, ProGCL is readily pluggable into various negatives-based GCL methods for performance improvement. We release the code at \\textcolor{magenta}\\url{https://github.com/junxia97/ProGCL}.", "bibtex": "@InProceedings{pmlr-v162-xia22b,\n title = \t {{P}ro{GCL}: Rethinking Hard Negative Mining in Graph Contrastive Learning},\n author = {Xia, Jun and Wu, Lirong and Wang, Ge and Chen, Jintao and Li, Stan Z.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24332--24346},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xia22b/xia22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/xia22b.html},\n abstract = \t {Contrastive Learning (CL) has emerged as a dominant technique for unsupervised representation learning which embeds augmented versions of the anchor close to each other (positive samples) and pushes the embeddings of other samples (negatives) apart. As revealed in recent studies, CL can benefit from hard negatives (negatives that are most similar to the anchor). However, we observe limited benefits when we adopt existing hard negative mining techniques of other domains in Graph Contrastive Learning (GCL). We perform both experimental and theoretical analysis on this phenomenon and find it can be attributed to the message passing of Graph Neural Networks (GNNs). Unlike CL in other domains, most hard negatives are potentially false negatives (negatives that share the same class with the anchor) if they are selected merely according to the similarities between anchor and themselves, which will undesirably push away the samples of the same class. To remedy this deficiency, we propose an effective method, dubbed \\textbf{ProGCL}, to estimate the probability of a negative being true one, which constitutes a more suitable measure for negatives\u2019 hardness together with similarity. Additionally, we devise two schemes (i.e., \\textbf{ProGCL-weight} and \\textbf{ProGCL-mix}) to boost the performance of GCL. Extensive experiments demonstrate that ProGCL brings notable and consistent improvements over base GCL methods and yields multiple state-of-the-art results on several unsupervised benchmarks or even exceeds the performance of supervised ones. Also, ProGCL is readily pluggable into various negatives-based GCL methods for performance improvement. 
We release the code at \\textcolor{magenta}\\url{https://github.com/junxia97/ProGCL}.}\n}", "pdf": "https://proceedings.mlr.press/v162/xia22b/xia22b.pdf", "supp": "", "pdf_size": 7934321, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13169866082987981674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Westlake University + Westlake Institute for Advanced Study; Westlake University + Westlake Institute for Advanced Study; Westlake University + Westlake Institute for Advanced Study; School of Computer Science, Zhejiang University; Westlake University + Westlake Institute for Advanced Study", "aff_domain": "westlake.edu.cn; ; ; ; ", "email": "westlake.edu.cn; ; ; ; ", "github": "https://github.com/junxia97/ProGCL", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xia22b.html", "aff_unique_index": "0+1;0+1;0+1;2;0+1", "aff_unique_norm": "Westlake University;Westlake Institute for Advanced Study;Zhejiang University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.westlake.edu.cn;https://www.westlake.edu.cn;http://www.zju.edu.cn", "aff_unique_abbr": "WU;WIAS;ZJU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0;0;0+0", "aff_country_unique": "China" }, { "title": "Probabilistic Bilevel Coreset Selection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17891", "id": "17891", "proceeding": "https://proceedings.mlr.press/v162/zhou22h.html", "poster": "/media/PosterPDFs/ICML%202022/30ef30b64204a3088a26bc2e6ecf7602_GNxEqCi.png?t=1657857863.0091252", "slides": "", "author_site": "Xiao Zhou, Renjie Pi, Weizhong Zhang, Yong LIN, Zonghao Chen, Tong Zhang", "author": "Xiao Zhou; Renjie Pi; Weizhong Zhang; Yong Lin; Zonghao Chen; Tong Zhang", "abstract": "The goal of coreset selection in supervised learning is to produce a weighted subset of data, so that training only on the subset achieves similar performance as training on the entire dataset. Existing methods achieved promising results in resource-constrained scenarios such as continual learning and streaming. However, most of the existing algorithms are limited to traditional machine learning models. A few algorithms that can handle large models adopt greedy search approaches due to the difficulty in solving the discrete subset selection problem, which is computationally costly when coreset becomes larger and often produces suboptimal results. In this work, for the first time we propose a continuous probabilistic bilevel formulation of coreset selection by learning a probabilistic weight for each training sample. The overall objective is posed as a bilevel optimization problem, where 1) the inner loop samples coresets and trains the model to convergence and 2) the outer loop updates the sample probability progressively according to the model\u2019s performance. Importantly, we develop an efficient solver to the bilevel optimization problem via unbiased policy gradient without the trouble of implicit differentiation. 
We theoretically prove the convergence of this training procedure and demonstrate the superiority of our algorithm against various coreset selection methods in various tasks, especially in more challenging label-noise and class-imbalance scenarios.", "bibtex": "@InProceedings{pmlr-v162-zhou22h,\n title = \t {Probabilistic Bilevel Coreset Selection},\n author = {Zhou, Xiao and Pi, Renjie and Zhang, Weizhong and Lin, Yong and Chen, Zonghao and Zhang, Tong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27287--27302},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22h/zhou22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22h.html},\n abstract = \t {The goal of coreset selection in supervised learning is to produce a weighted subset of data, so that training only on the subset achieves similar performance as training on the entire dataset. Existing methods achieved promising results in resource-constrained scenarios such as continual learning and streaming. However, most of the existing algorithms are limited to traditional machine learning models. A few algorithms that can handle large models adopt greedy search approaches due to the difficulty in solving the discrete subset selection problem, which is computationally costly when coreset becomes larger and often produces suboptimal results. In this work, for the first time we propose a continuous probabilistic bilevel formulation of coreset selection by learning a probabilistic weight for each training sample. The overall objective is posed as a bilevel optimization problem, where 1) the inner loop samples coresets and trains the model to convergence and 2) the outer loop updates the sample probability progressively according to the model\u2019s performance. Importantly, we develop an efficient solver to the bilevel optimization problem via unbiased policy gradient without the trouble of implicit differentiation. 
We theoretically prove the convergence of this training procedure and demonstrate the superiority of our algorithm against various coreset selection methods in various tasks, especially in more challenging label-noise and class-imbalance scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22h/zhou22h.pdf", "supp": "", "pdf_size": 1836456, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13987769128031921166&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhou22h.html" }, { "title": "Probabilistic ODE Solutions in Millions of Dimensions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16335", "id": "16335", "proceeding": "https://proceedings.mlr.press/v162/kramer22b.html", "poster": "/media/PosterPDFs/ICML%202022/403ea2e851b9ab04a996beab4a480a30.png?t=1657523214.2379231", "slides": "", "author_site": "Nicholas Kr\u00e4mer, Nathanael Bosch, Jonathan Schmidt, Philipp Hennig", "author": "Nicholas Kr\u00e4mer; Nathanael Bosch; Jonathan Schmidt; Philipp Hennig", "abstract": "Probabilistic solvers for ordinary differential equations (ODEs) have emerged as an efficient framework for uncertainty quantification and inference on dynamical systems. In this work, we explain the mathematical assumptions and detailed implementation schemes behind solving high-dimensional ODEs with a probabilistic numerical algorithm. This has not been possible before due to matrix-matrix operations in each solver step, but is crucial for scientifically relevant problems\u2014most importantly, the solution of discretised partial differential equations. In a nutshell, efficient high-dimensional probabilistic ODE solutions build either on independence assumptions or on Kronecker structure in the prior model. We evaluate the resulting efficiency on a range of problems, including the probabilistic numerical simulation of a differential equation with millions of dimensions.", "bibtex": "@InProceedings{pmlr-v162-kramer22b,\n title = \t {Probabilistic {ODE} Solutions in Millions of Dimensions},\n author = {Kr{\\\"a}mer, Nicholas and Bosch, Nathanael and Schmidt, Jonathan and Hennig, Philipp},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11634--11649},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kramer22b/kramer22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/kramer22b.html},\n abstract = \t {Probabilistic solvers for ordinary differential equations (ODEs) have emerged as an efficient framework for uncertainty quantification and inference on dynamical systems. In this work, we explain the mathematical assumptions and detailed implementation schemes behind solving high-dimensional ODEs with a probabilistic numerical algorithm. This has not been possible before due to matrix-matrix operations in each solver step, but is crucial for scientifically relevant problems\u2014most importantly, the solution of discretised partial differential equations. 
In a nutshell, efficient high-dimensional probabilistic ODE solutions build either on independence assumptions or on Kronecker structure in the prior model. We evaluate the resulting efficiency on a range of problems, including the probabilistic numerical simulation of a differential equation with millions of dimensions.}\n}", "pdf": "https://proceedings.mlr.press/v162/kramer22b/kramer22b.pdf", "supp": "", "pdf_size": 1948753, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17865414228264739501&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of T\u00fcbingen, T\u00fcbingen, Germany+Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; University of T\u00fcbingen, T\u00fcbingen, Germany+Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; University of T\u00fcbingen, T\u00fcbingen, Germany+Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany; University of T\u00fcbingen, T\u00fcbingen, Germany+Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de; ", "email": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kramer22b.html", "aff_unique_index": "0+1;0+1;0+1;0+1", "aff_unique_norm": "University of T\u00fcbingen;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.mpi-is.mpg.de", "aff_unique_abbr": "Uni T\u00fcbingen;MPI-IS", "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "T\u00fcbingen", "aff_country_unique_index": "0+0;0+0;0+0;0+0", "aff_country_unique": "Germany" }, { "title": "Probabilistically Robust Learning: Balancing Average and Worst-case Performance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16783", "id": "16783", "proceeding": "https://proceedings.mlr.press/v162/robey22a.html", "poster": "", "slides": "", "author_site": "Alex Robey, Luiz F. O. Chamon, George J. Pappas, Hamed Hassani", "author": "Alexander Robey; Luiz Chamon; George J. Pappas; Hamed Hassani", "abstract": "Many of the successes of machine learning are based on minimizing an averaged loss function. However, it is well-known that this paradigm suffers from robustness issues that hinder its applicability in safety-critical domains. These issues are often addressed by training against worst-case perturbations of data, a technique known as adversarial training. Although empirically effective, adversarial training can be overly conservative, leading to unfavorable trade-offs between nominal performance and robustness. To this end, in this paper we propose a framework called probabilistic robustness that bridges the gap between the accurate, yet brittle average case and the robust, yet conservative worst case by enforcing robustness to most rather than to all perturbations. From a theoretical point of view, this framework overcomes the trade-offs between the performance and the sample-complexity of worst-case and average-case learning. From a practical point of view, we propose a novel algorithm based on risk-aware optimization that effectively balances average- and worst-case performance at a considerably lower computational cost relative to adversarial training. Our results on MNIST, CIFAR-10, and SVHN illustrate the advantages of this framework on the spectrum from average- to worst-case robustness. 
Our code is available at: https://github.com/arobey1/advbench.", "bibtex": "@InProceedings{pmlr-v162-robey22a,\n title = \t {Probabilistically Robust Learning: Balancing Average and Worst-case Performance},\n author = {Robey, Alexander and Chamon, Luiz and Pappas, George J. and Hassani, Hamed},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18667--18686},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/robey22a/robey22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/robey22a.html},\n abstract = \t {Many of the successes of machine learning are based on minimizing an averaged loss function. However, it is well-known that this paradigm suffers from robustness issues that hinder its applicability in safety-critical domains. These issues are often addressed by training against worst-case perturbations of data, a technique known as adversarial training. Although empirically effective, adversarial training can be overly conservative, leading to unfavorable trade-offs between nominal performance and robustness. To this end, in this paper we propose a framework called probabilistic robustness that bridges the gap between the accurate, yet brittle average case and the robust, yet conservative worst case by enforcing robustness to most rather than to all perturbations. From a theoretical point of view, this framework overcomes the trade-offs between the performance and the sample-complexity of worst-case and average-case learning. From a practical point of view, we propose a novel algorithm based on risk-aware optimization that effectively balances average- and worst-case performance at a considerably lower computational cost relative to adversarial training. Our results on MNIST, CIFAR-10, and SVHN illustrate the advantages of this framework on the spectrum from average- to worst-case robustness. 
Our code is available at: https://github.com/arobey1/advbench.}\n}", "pdf": "https://proceedings.mlr.press/v162/robey22a/robey22a.pdf", "supp": "", "pdf_size": 1080529, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12042354496298594983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Systems Engineering, University of Pennsylvania, Philadelphia, PA, USA+University of California, Berkeley, Berkeley, CA, USA; University of California, Berkeley, Berkeley, CA, USA; Department of Electrical and Systems Engineering, University of Pennsylvania, Philadelphia, PA, USA; Department of Electrical and Systems Engineering, University of Pennsylvania, Philadelphia, PA, USA", "aff_domain": "seas.upenn.edu; ; ; ", "email": "seas.upenn.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/robey22a.html", "aff_unique_index": "0+1;1;0;0", "aff_unique_norm": "University of Pennsylvania;University of California, Berkeley", "aff_unique_dep": "Department of Electrical and Systems Engineering;", "aff_unique_url": "https://www.upenn.edu;https://www.berkeley.edu", "aff_unique_abbr": "UPenn;UC Berkeley", "aff_campus_unique_index": "0+1;1;0;0", "aff_campus_unique": "Philadelphia;Berkeley", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "ProgFed: Effective, Communication, and Computation Efficient Federated Learning by Progressive Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16193", "id": "16193", "proceeding": "https://proceedings.mlr.press/v162/wang22y.html", "poster": "", "slides": "", "author_site": "Hui-Po Wang, Sebastian Stich, Yang He, Mario Fritz", "author": "Hui-Po Wang; Sebastian Stich; Yang He; Mario Fritz", "abstract": "Federated learning is a powerful distributed learning scheme that allows numerous edge devices to collaboratively train a model without sharing their data. However, training is resource-intensive for edge devices, and limited network bandwidth is often the main bottleneck. Prior work often overcomes the constraints by condensing the models or messages into compact formats, e.g., by gradient compression or distillation. In contrast, we propose ProgFed, the first progressive training framework for efficient and effective federated learning. It inherently reduces computation and two-way communication costs while maintaining the strong performance of the final models. We theoretically prove that ProgFed converges at the same asymptotic rate as standard training on full models. Extensive results on a broad range of architectures, including CNNs (VGG, ResNet, ConvNets) and U-nets, and diverse tasks from simple classification to medical image segmentation show that our highly effective training approach saves up to $20%$ computation and up to $63%$ communication costs for converged models. As our approach is also complementary to prior work on compression, we can achieve a wide range of trade-offs by combining these techniques, showing reduced communication of up to $50\\times$ at only $0.1%$ loss in utility. 
Code is available at https://github.com/a514514772/ProgFed.", "bibtex": "@InProceedings{pmlr-v162-wang22y,\n title = \t {{P}rog{F}ed: Effective, Communication, and Computation Efficient Federated Learning by Progressive Training},\n author = {Wang, Hui-Po and Stich, Sebastian and He, Yang and Fritz, Mario},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23034--23054},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22y/wang22y.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22y.html},\n abstract = \t {Federated learning is a powerful distributed learning scheme that allows numerous edge devices to collaboratively train a model without sharing their data. However, training is resource-intensive for edge devices, and limited network bandwidth is often the main bottleneck. Prior work often overcomes the constraints by condensing the models or messages into compact formats, e.g., by gradient compression or distillation. In contrast, we propose ProgFed, the first progressive training framework for efficient and effective federated learning. It inherently reduces computation and two-way communication costs while maintaining the strong performance of the final models. We theoretically prove that ProgFed converges at the same asymptotic rate as standard training on full models. Extensive results on a broad range of architectures, including CNNs (VGG, ResNet, ConvNets) and U-nets, and diverse tasks from simple classification to medical image segmentation show that our highly effective training approach saves up to $20%$ computation and up to $63%$ communication costs for converged models. As our approach is also complementary to prior work on compression, we can achieve a wide range of trade-offs by combining these techniques, showing reduced communication of up to $50\\times$ at only $0.1%$ loss in utility. 
Code is available at https://github.com/a514514772/ProgFed.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22y/wang22y.pdf", "supp": "", "pdf_size": 3371597, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14093452975120098193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "CISPA Helmholz Center for Information Security, Germany; CISPA Helmholz Center for Information Security, Germany; CISPA Helmholz Center for Information Security, Germany; CISPA Helmholz Center for Information Security, Germany", "aff_domain": "cispa.de; ; ; ", "email": "cispa.de; ; ; ", "github": "https://github.com/a514514772/ProgFed", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wang22y.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "CISPA Helmholz Center for Information Security", "aff_unique_dep": "", "aff_unique_url": "https://www.cispa.de/", "aff_unique_abbr": "CISPA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Prompting Decision Transformer for Few-Shot Policy Generalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18153", "id": "18153", "proceeding": "https://proceedings.mlr.press/v162/xu22g.html", "poster": "/media/PosterPDFs/ICML%202022/15b3342aa0abd5176b93d68ddf95e3ce.png?t=1657653137.4371893", "slides": "", "author_site": "Mengdi Xu, Yikang Shen, Shun Zhang, Yuchen Lu, Ding Zhao, Josh Tenenbaum, Chuang Gan", "author": "Mengdi Xu; Yikang Shen; Shun Zhang; Yuchen Lu; Ding Zhao; Joshua Tenenbaum; Chuang Gan", "abstract": "Human can leverage prior experience and learn novel tasks from a handful of demonstrations. In contrast to offline meta-reinforcement learning, which aims to achieve quick adaptation through better algorithm design, we investigate the effect of architecture inductive bias on the few-shot learning capability. We propose a Prompt-based Decision Transformer (Prompt-DT), which leverages the sequential modeling ability of the Transformer architecture and the prompt framework to achieve few-shot adaptation in offline RL. We design the trajectory prompt, which contains segments of the few-shot demonstrations, and encodes task-specific information to guide policy generation. Our experiments in five MuJoCo control benchmarks show that Prompt-DT is a strong few-shot learner without any extra finetuning on unseen target tasks. Prompt-DT outperforms its variants and strong meta offline RL baselines by a large margin with a trajectory prompt containing only a few timesteps. Prompt-DT is also robust to prompt length changes and can generalize to out-of-distribution (OOD) environments. 
Project page: \\href{https://mxu34.github.io/PromptDT/}{https://mxu34.github.io/PromptDT/}.", "bibtex": "@InProceedings{pmlr-v162-xu22g,\n title = \t {Prompting Decision Transformer for Few-Shot Policy Generalization},\n author = {Xu, Mengdi and Shen, Yikang and Zhang, Shun and Lu, Yuchen and Zhao, Ding and Tenenbaum, Joshua and Gan, Chuang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24631--24645},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22g/xu22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22g.html},\n abstract = \t {Human can leverage prior experience and learn novel tasks from a handful of demonstrations. In contrast to offline meta-reinforcement learning, which aims to achieve quick adaptation through better algorithm design, we investigate the effect of architecture inductive bias on the few-shot learning capability. We propose a Prompt-based Decision Transformer (Prompt-DT), which leverages the sequential modeling ability of the Transformer architecture and the prompt framework to achieve few-shot adaptation in offline RL. We design the trajectory prompt, which contains segments of the few-shot demonstrations, and encodes task-specific information to guide policy generation. Our experiments in five MuJoCo control benchmarks show that Prompt-DT is a strong few-shot learner without any extra finetuning on unseen target tasks. Prompt-DT outperforms its variants and strong meta offline RL baselines by a large margin with a trajectory prompt containing only a few timesteps. Prompt-DT is also robust to prompt length changes and can generalize to out-of-distribution (OOD) environments. 
Project page: \\href{https://mxu34.github.io/PromptDT/}{https://mxu34.github.io/PromptDT/}.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22g/xu22g.pdf", "supp": "", "pdf_size": 4021130, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1409733122996470399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Carnegie Mellon University; University of Montreal, Mila; MIT-IBM Watson AI Lab; University of Montreal, Mila; Carnegie Mellon University; Massachusetts Institute of Technology; MIT-IBM Watson AI Lab+UMass Amherst", "aff_domain": "andrew.cmu.edu; ; ; ; ; ; ", "email": "andrew.cmu.edu; ; ; ; ; ; ", "github": "", "project": "https://mxu34.github.io/PromptDT/", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/xu22g.html", "aff_unique_index": "0;1;2;1;0;2;2+3", "aff_unique_norm": "Carnegie Mellon University;University of Montreal;Massachusetts Institute of Technology;University of Massachusetts Amherst", "aff_unique_dep": ";Mila;IBM Watson AI Lab;", "aff_unique_url": "https://www.cmu.edu;https://www.mila.quebec;https://www.mitibmwatsonailab.org;https://www.umass.edu", "aff_unique_abbr": "CMU;UM;MIT-IBM AI Lab;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", "aff_country_unique_index": "0;1;0;1;0;0;0+0", "aff_country_unique": "United States;Canada" }, { "title": "Prototype Based Classification from Hierarchy to Fairness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16089", "id": "16089", "proceeding": "https://proceedings.mlr.press/v162/tucker22a.html", "poster": "/media/PosterPDFs/ICML%202022/a03fa30821986dff10fc66647c84c9c3.png?t=1657516240.9506538", "slides": "", "author_site": "Mycal Tucker, Julie Shah", "author": "Mycal Tucker; Julie A. Shah", "abstract": "Artificial neural nets can represent and classify many types of high-dimensional data but are often tailored to particular applications \u2013 e.g., for \u201cfair\u201d or \u201chierarchical\u201d classification. Once an architecture has been selected, it is often difficult for humans to adjust models for a new task; for example, a hierarchical classifier cannot be easily transformed into a fair classifier that shields a protected field. Our contribution in this work is a new neural network architecture, the concept subspace network (CSN), which generalizes existing specialized classifiers to produce a unified model capable of learning a spectrum of multi-concept relationships. We demonstrate that CSNs reproduce state-of-the-art results in fair classification when enforcing concept independence, may be transformed into hierarchical classifiers, or may even reconcile fairness and hierarchy within a single classifier. 
The CSN is inspired by and matches the performance of existing prototype-based classifiers that promote interpretability.", "bibtex": "@InProceedings{pmlr-v162-tucker22a,\n title = \t {Prototype Based Classification from Hierarchy to Fairness},\n author = {Tucker, Mycal and Shah, Julie A.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21884--21900},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tucker22a/tucker22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tucker22a.html},\n abstract = \t {Artificial neural nets can represent and classify many types of high-dimensional data but are often tailored to particular applications \u2013 e.g., for \u201cfair\u201d or \u201chierarchical\u201d classification. Once an architecture has been selected, it is often difficult for humans to adjust models for a new task; for example, a hierarchical classifier cannot be easily transformed into a fair classifier that shields a protected field. Our contribution in this work is a new neural network architecture, the concept subspace network (CSN), which generalizes existing specialized classifiers to produce a unified model capable of learning a spectrum of multi-concept relationships. We demonstrate that CSNs reproduce state-of-the-art results in fair classification when enforcing concept independence, may be transformed into hierarchical classifiers, or may even reconcile fairness and hierarchy within a single classifier. The CSN is inspired by and matches the performance of existing prototype-based classifiers that promote interpretability.}\n}", "pdf": "https://proceedings.mlr.press/v162/tucker22a/tucker22a.pdf", "supp": "", "pdf_size": 1460024, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11530419927101336822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "CSAIL, Massachusetts Institute of Technology, Massachusetts, USA; CSAIL, Massachusetts Institute of Technology, Massachusetts, USA", "aff_domain": "mit.edu; ", "email": "mit.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/tucker22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Massachusetts", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Prototype-Anchored Learning for Learning with Imperfect Annotations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17177", "id": "17177", "proceeding": "https://proceedings.mlr.press/v162/zhou22f.html", "poster": "/media/PosterPDFs/ICML%202022/28dd2c7955ce926456240b2ff0100bde.png?t=1656167494.583342", "slides": "", "author_site": "Xiong Zhou, Xianming Liu, Deming Zhai, Junjun Jiang, Xin Gao, Xiangyang Ji", "author": "Xiong Zhou; Xianming Liu; Deming Zhai; Junjun Jiang; Xin Gao; Xiangyang Ji", "abstract": "The success of deep neural networks greatly relies on the availability of large amounts of high-quality annotated data, which however are difficult or 
expensive to obtain. The resulting labels may be class imbalanced, noisy or human biased. It is challenging to learn unbiased classification models from imperfectly annotated datasets, on which we usually suffer from overfitting or underfitting. In this work, we thoroughly investigate the popular softmax loss and margin-based loss, and offer a feasible approach to tighten the generalization error bound by maximizing the minimal sample margin. We further derive the optimality condition for this purpose, which indicates how the class prototypes should be anchored. Motivated by theoretical analysis, we propose a simple yet effective method, namely prototype-anchored learning (PAL), which can be easily incorporated into various learning-based classification schemes to handle imperfect annotation. We verify the effectiveness of PAL on class-imbalanced learning and noise-tolerant learning by extensive experiments on synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-zhou22f,\n title = \t {Prototype-Anchored Learning for Learning with Imperfect Annotations},\n author = {Zhou, Xiong and Liu, Xianming and Zhai, Deming and Jiang, Junjun and Gao, Xin and Ji, Xiangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27245--27267},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22f/zhou22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22f.html},\n abstract = \t {The success of deep neural networks greatly relies on the availability of large amounts of high-quality annotated data, which however are difficult or expensive to obtain. The resulting labels may be class imbalanced, noisy or human biased. It is challenging to learn unbiased classification models from imperfectly annotated datasets, on which we usually suffer from overfitting or underfitting. In this work, we thoroughly investigate the popular softmax loss and margin-based loss, and offer a feasible approach to tighten the generalization error bound by maximizing the minimal sample margin. We further derive the optimality condition for this purpose, which indicates how the class prototypes should be anchored. Motivated by theoretical analysis, we propose a simple yet effective method, namely prototype-anchored learning (PAL), which can be easily incorporated into various learning-based classification schemes to handle imperfect annotation. 
We verify the effectiveness of PAL on class-imbalanced learning and noise-tolerant learning by extensive experiments on synthetic and real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22f/zhou22f.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhou22f-supp.zip", "pdf_size": 14082343, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1584558154247096344&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhou22f.html" }, { "title": "Provable Acceleration of Heavy Ball beyond Quadratics for a Class of Polyak-Lojasiewicz Functions when the Non-Convexity is Averaged-Out", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16517", "id": "16517", "proceeding": "https://proceedings.mlr.press/v162/wang22p.html", "poster": "/media/PosterPDFs/ICML%202022/0a54b19a13b6712dc04d1b49215423d8_KdK1pUN.png?t=1657486382.3933916", "slides": "", "author_site": "Jun-Kun Wang, Chi-Heng Lin, Andre Wibisono, Bin Hu", "author": "Jun-Kun Wang; Chi-Heng Lin; Andre Wibisono; Bin Hu", "abstract": "Heavy Ball (HB) nowadays is one of the most popular momentum methods in non-convex optimization. It has been widely observed that incorporating the Heavy Ball dynamic in gradient-based methods accelerates the training process of modern machine learning models. However, the progress on establishing its theoretical foundation of acceleration is apparently far behind its empirical success. Existing provable acceleration results are of the quadratic or close-to-quadratic functions, as the current techniques of showing HB\u2019s acceleration are limited to the case when the Hessian is fixed. In this work, we develop some new techniques that help show acceleration beyond quadratics, which is achieved by analyzing how the change of the Hessian at two consecutive time points affects the convergence speed. Based on our technical results, a class of Polyak-Lojasiewicz (PL) optimization problems for which provable acceleration can be achieved via HB is identified. Moreover, our analysis demonstrates a benefit of adaptively setting the momentum parameter.", "bibtex": "@InProceedings{pmlr-v162-wang22p,\n title = \t {Provable Acceleration of Heavy Ball beyond Quadratics for a Class of Polyak-Lojasiewicz Functions when the Non-Convexity is Averaged-Out},\n author = {Wang, Jun-Kun and Lin, Chi-Heng and Wibisono, Andre and Hu, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22839--22864},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22p/wang22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22p.html},\n abstract = \t {Heavy Ball (HB) nowadays is one of the most popular momentum methods in non-convex optimization. It has been widely observed that incorporating the Heavy Ball dynamic in gradient-based methods accelerates the training process of modern machine learning models. However, the progress on establishing its theoretical foundation of acceleration is apparently far behind its empirical success. 
Existing provable acceleration results are of the quadratic or close-to-quadratic functions, as the current techniques of showing HB\u2019s acceleration are limited to the case when the Hessian is fixed. In this work, we develop some new techniques that help show acceleration beyond quadratics, which is achieved by analyzing how the change of the Hessian at two consecutive time points affects the convergence speed. Based on our technical results, a class of Polyak-Lojasiewicz (PL) optimization problems for which provable acceleration can be achieved via HB is identified. Moreover, our analysis demonstrates a benefit of adaptively setting the momentum parameter.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22p/wang22p.pdf", "supp": "", "pdf_size": 696480, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9497306489416011406&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computer Science, Yale University; Electrical and Computer Engineering, Georgia Institute of Technology; Computer Science, Yale University; Electrical and Computer Engineering & Coordinated Science Laboratory, University of Illinois at Urbana-Champaign", "aff_domain": "yale.edu;gatech.edu;yale.edu;illinois.edu", "email": "yale.edu;gatech.edu;yale.edu;illinois.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wang22p.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Yale University;Georgia Institute of Technology;University of Illinois Urbana-Champaign", "aff_unique_dep": "Computer Science;Electrical and Computer Engineering;Electrical and Computer Engineering & Coordinated Science Laboratory", "aff_unique_url": "https://www.yale.edu;https://www.gatech.edu;https://illinois.edu", "aff_unique_abbr": "Yale;Georgia Tech;UIUC", "aff_campus_unique_index": "0;1;0;2", "aff_campus_unique": "New Haven;Atlanta;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Domain Generalization via Invariant-Feature Subspace Recovery", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17765", "id": "17765", "proceeding": "https://proceedings.mlr.press/v162/wang22x.html", "poster": "/media/PosterPDFs/ICML%202022/0771fc6f0f4b1d7d1bb73bbbe14e0e31_5s72U7T.png?t=1657923353.6626046", "slides": "", "author_site": "Haoxiang Wang, Haozhe Si, Bo Li, Han Zhao", "author": "Haoxiang Wang; Haozhe Si; Bo Li; Han Zhao", "abstract": "Domain generalization asks for models trained over a set of training environments to perform well in unseen test environments. Recently, a series of algorithms such as Invariant Risk Minimization (IRM) has been proposed for domain generalization. However, Rosenfeld et al. (2021) shows that in a simple linear data model, even if non-convexity issues are ignored, IRM and its extensions cannot generalize to unseen environments with less than $d_s+1$ training environments, where $d_s$ is the dimension of the spurious-feature subspace. In this paper, we propose to achieve domain generalization with Invariant-feature Subspace Recovery (ISR). Our first algorithm, ISR-Mean, can identify the subspace spanned by invariant features from the first-order moments of the class-conditional distributions, and achieve provable domain generalization with $d_s+1$ training environments under the data model of Rosenfeld et al. (2021). 
Our second algorithm, ISR-Cov, further reduces the required number of training environments to $O(1)$ using the information of second-order moments. Notably, unlike IRM, our algorithms bypass non-convexity issues and enjoy global convergence guarantees. Empirically, our ISRs can obtain superior performance compared with IRM on synthetic benchmarks. In addition, on three real-world image and text datasets, we show that both ISRs can be used as simple yet effective post-processing methods to improve the worst-case accuracy of (pre-)trained models against spurious correlations and group shifts.", "bibtex": "@InProceedings{pmlr-v162-wang22x,\n title = \t {Provable Domain Generalization via Invariant-Feature Subspace Recovery},\n author = {Wang, Haoxiang and Si, Haozhe and Li, Bo and Zhao, Han},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23018--23033},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22x/wang22x.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22x.html},\n abstract = \t {Domain generalization asks for models trained over a set of training environments to perform well in unseen test environments. Recently, a series of algorithms such as Invariant Risk Minimization (IRM) has been proposed for domain generalization. However, Rosenfeld et al. (2021) shows that in a simple linear data model, even if non-convexity issues are ignored, IRM and its extensions cannot generalize to unseen environments with less than $d_s+1$ training environments, where $d_s$ is the dimension of the spurious-feature subspace. In this paper, we propose to achieve domain generalization with Invariant-feature Subspace Recovery (ISR). Our first algorithm, ISR-Mean, can identify the subspace spanned by invariant features from the first-order moments of the class-conditional distributions, and achieve provable domain generalization with $d_s+1$ training environments under the data model of Rosenfeld et al. (2021). Our second algorithm, ISR-Cov, further reduces the required number of training environments to $O(1)$ using the information of second-order moments. Notably, unlike IRM, our algorithms bypass non-convexity issues and enjoy global convergence guarantees. Empirically, our ISRs can obtain superior performance compared with IRM on synthetic benchmarks. 
In addition, on three real-world image and text datasets, we show that both ISRs can be used as simple yet effective post-processing methods to improve the worst-case accuracy of (pre-)trained models against spurious correlations and group shifts.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22x/wang22x.pdf", "supp": "", "pdf_size": 672789, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16846223791215545357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign", "aff_domain": "illinois.edu; ; ; ", "email": "illinois.edu; ; ; ", "github": "https://github.com/Haoxiang-Wang/ISR", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wang22x.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Reinforcement Learning with a Short-Term Memory", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17201", "id": "17201", "proceeding": "https://proceedings.mlr.press/v162/efroni22a.html", "poster": "", "slides": "", "author_site": "Yonathan Efroni, Chi Jin, Akshay Krishnamurthy, Sobhan Miryoosefi", "author": "Yonathan Efroni; Chi Jin; Akshay Krishnamurthy; Sobhan Miryoosefi", "abstract": "Real-world sequential decision making problems commonly involve partial observability, which requires the agent to maintain a memory of history in order to infer the latent states, plan and make good decisions. Coping with partial observability in general is extremely challenging, as a number of worst-case statistical and computational barriers are known in learning Partially Observable Markov Decision Processes (POMDPs). Motivated by the problem structure in several physical applications, as well as a commonly used technique known as \"frame stacking\", this paper proposes to study a new subclass of POMDPs, whose latent states can be decoded by the most recent history of a short length m. We establish a set of upper and lower bounds on the sample complexity for learning near-optimal policies for this class of problems in both tabular and rich-observation settings (where the number of observations is enormous). In particular, in the rich-observation setting, we develop new algorithms using a novel \"moment matching\" approach with a sample complexity that scales exponentially with the short length m rather than the problem horizon, and is independent of the number of observations. 
Our results show that a short-term memory suffices for reinforcement learning in these environments.", "bibtex": "@InProceedings{pmlr-v162-efroni22a,\n title = \t {Provable Reinforcement Learning with a Short-Term Memory},\n author = {Efroni, Yonathan and Jin, Chi and Krishnamurthy, Akshay and Miryoosefi, Sobhan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5832--5850},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/efroni22a/efroni22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/efroni22a.html},\n abstract = \t {Real-world sequential decision making problems commonly involve partial observability, which requires the agent to maintain a memory of history in order to infer the latent states, plan and make good decisions. Coping with partial observability in general is extremely challenging, as a number of worst-case statistical and computational barriers are known in learning Partially Observable Markov Decision Processes (POMDPs). Motivated by the problem structure in several physical applications, as well as a commonly used technique known as \"frame stacking\", this paper proposes to study a new subclass of POMDPs, whose latent states can be decoded by the most recent history of a short length m. We establish a set of upper and lower bounds on the sample complexity for learning near-optimal policies for this class of problems in both tabular and rich-observation settings (where the number of observations is enormous). In particular, in the rich-observation setting, we develop new algorithms using a novel \"moment matching\" approach with a sample complexity that scales exponentially with the short length m rather than the problem horizon, and is independent of the number of observations. 
Our results show that a short-term memory suffices for reinforcement learning in these environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/efroni22a/efroni22a.pdf", "supp": "", "pdf_size": 1217965, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8243635031742517245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Microsoft Research, New York; Princeton University; Microsoft Research, New York; Princeton University", "aff_domain": "microsoft.com;cs.princeton.edu;microsoft.com;cs.princeton.edu", "email": "microsoft.com;cs.princeton.edu;microsoft.com;cs.princeton.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/efroni22a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Microsoft;Princeton University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.princeton.edu", "aff_unique_abbr": "MSR;Princeton", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Stochastic Optimization for Global Contrastive Learning: Small Batch Does Not Harm Performance", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16561", "id": "16561", "proceeding": "https://proceedings.mlr.press/v162/yuan22b.html", "poster": "/media/PosterPDFs/ICML%202022/46489c17893dfdcf028883202cefd6d1_3jty6P0.png?t=1658378106.0816479", "slides": "/media/icml-2022/Slides/16561_Cz2BnXM.pdf", "author_site": "Zhuoning Yuan, Yuexin Wu, Zi-Hao Qiu, Xianzhi Du, Lijun Zhang, Denny Zhou, Tianbao Yang", "author": "Zhuoning Yuan; Yuexin Wu; Zi-Hao Qiu; Xianzhi Du; Lijun Zhang; Denny Zhou; Tianbao Yang", "abstract": "In this paper, we study contrastive learning from an optimization perspective, aiming to analyze and address a fundamental issue of existing contrastive learning methods that either rely on a large batch size or a large dictionary of feature vectors. We consider a global objective for contrastive learning, which contrasts each positive pair with all negative pairs for an anchor point. From the optimization perspective, we explain why existing methods such as SimCLR require a large batch size in order to achieve a satisfactory result. In order to remove such requirement, we propose a memory-efficient Stochastic Optimization algorithm for solving the Global objective of Contrastive Learning of Representations, named SogCLR. We show that its optimization error is negligible under a reasonable condition after a sufficient number of iterations or is diminishing for a slightly different global contrastive objective. Empirically, we demonstrate that SogCLR with small batch size (e.g., 256) can achieve similar performance as SimCLR with large batch size (e.g., 8192) on self-supervised learning task on ImageNet-1K. We also attempt to show that the proposed optimization technique is generic and can be applied to solving other contrastive losses, e.g., two-way contrastive losses for bimodal contrastive learning. 
The proposed method is implemented in our open-sourced library LibAUC (www.libauc.org).", "bibtex": "@InProceedings{pmlr-v162-yuan22b,\n title = \t {Provable Stochastic Optimization for Global Contrastive Learning: Small Batch Does Not Harm Performance},\n author = {Yuan, Zhuoning and Wu, Yuexin and Qiu, Zi-Hao and Du, Xianzhi and Zhang, Lijun and Zhou, Denny and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25760--25782},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yuan22b/yuan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/yuan22b.html},\n abstract = \t {In this paper, we study contrastive learning from an optimization perspective, aiming to analyze and address a fundamental issue of existing contrastive learning methods that either rely on a large batch size or a large dictionary of feature vectors. We consider a global objective for contrastive learning, which contrasts each positive pair with all negative pairs for an anchor point. From the optimization perspective, we explain why existing methods such as SimCLR require a large batch size in order to achieve a satisfactory result. In order to remove such requirement, we propose a memory-efficient Stochastic Optimization algorithm for solving the Global objective of Contrastive Learning of Representations, named SogCLR. We show that its optimization error is negligible under a reasonable condition after a sufficient number of iterations or is diminishing for a slightly different global contrastive objective. Empirically, we demonstrate that SogCLR with small batch size (e.g., 256) can achieve similar performance as SimCLR with large batch size (e.g., 8192) on self-supervised learning task on ImageNet-1K. We also attempt to show that the proposed optimization technique is generic and can be applied to solving other contrastive losses, e.g., two-way contrastive losses for bimodal contrastive learning. 
The proposed method is implemented in our open-sourced library LibAUC (www.libauc.org).}\n}", "pdf": "https://proceedings.mlr.press/v162/yuan22b/yuan22b.pdf", "supp": "", "pdf_size": 2117733, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15134859990556719420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Iowa; Google Research; Nanjing University; Google Research; Nanjing University; Google Research; University of Iowa", "aff_domain": "uiowa.edu; ; ; ; ; ;uiowa.edu", "email": "uiowa.edu; ; ; ; ; ;uiowa.edu", "github": "", "project": "www.libauc.org", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/yuan22b.html", "aff_unique_index": "0;1;2;1;2;1;0", "aff_unique_norm": "University of Iowa;Google;Nanjing University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.uiowa.edu;https://research.google;https://www.nju.edu.cn", "aff_unique_abbr": "UIowa;Google Research;Nanjing U", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Provably Adversarially Robust Nearest Prototype Classifiers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17803", "id": "17803", "proceeding": "https://proceedings.mlr.press/v162/voracek22a.html", "poster": "/media/PosterPDFs/ICML%202022/358c850b3836ae02b1d8b319d86d435f.png?t=1657916731.3795047", "slides": "", "author_site": "V\u00e1clav Vor\u00e1\u010dek, Matthias Hein", "author": "V\u00e1clav Vor\u00e1\u010dek; Matthias Hein", "abstract": "Nearest prototype classifiers (NPCs) assign to each input point the label of the nearest prototype with respect to a chosen distance metric. A direct advantage of NPCs is that the decisions are interpretable. Previous work could provide lower bounds on the minimal adversarial perturbation in the $\\ell_p$-threat model when using the same $\\ell_p$-distance for the NPCs. In this paper we provide a complete discussion on the complexity when using $\\ell_p$-distances for decision and $\\ell_q$-threat models for certification for $p,q \\in \\{1,2,\\infty\\}$. In particular we provide scalable algorithms for the", "bibtex": "@InProceedings{pmlr-v162-voracek22a,\n title = \t {Provably Adversarially Robust Nearest Prototype Classifiers},\n author = {Vor{\\'a}{\\v{c}}ek, V{\\'a}clav and Hein, Matthias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22361--22383},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/voracek22a/voracek22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/voracek22a.html},\n abstract = \t {Nearest prototype classifiers (NPCs) assign to each input point the label of the nearest prototype with respect to a chosen distance metric. A direct advantage of NPCs is that the decisions are interpretable. Previous work could provide lower bounds on the minimal adversarial perturbation in the $\\ell_p$-threat model when using the same $\\ell_p$-distance for the NPCs. 
In this paper we provide a complete discussion on the complexity when using $\\ell_p$-distances for decision and $\\ell_q$-threat models for certification for $p,q \\in \\{1,2,\\infty\\}$. In particular we provide scalable algorithms for the", "pdf": "https://proceedings.mlr.press/v162/voracek22a/voracek22a.pdf", "supp": "", "pdf_size": 795249, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12783036933914721155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of T\u00fcbingen, Germany; University of T\u00fcbingen, Germany", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de", "email": "uni-tuebingen.de;uni-tuebingen.de", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/voracek22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of T\u00fcbingen", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Provably Efficient Offline Reinforcement Learning for Partially Observable Markov Decision Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17393", "id": "17393", "proceeding": "https://proceedings.mlr.press/v162/guo22a.html", "poster": "/media/PosterPDFs/ICML%202022/df3aebc649f9e3b674eeb790a4da224e.png?t=1657260009.4791234", "slides": "", "author_site": "Hongyi Guo, Qi Cai, Yufeng Zhang, Zhuoran Yang, Zhaoran Wang", "author": "Hongyi Guo; Qi Cai; Yufeng Zhang; Zhuoran Yang; Zhaoran Wang", "abstract": "We study offline reinforcement learning (RL) for partially observable Markov decision processes (POMDPs) with possibly infinite state and observation spaces. Under the undercompleteness assumption, the optimal policy in such POMDPs are characterized by a class of finite-memory Bellman operators. In the offline setting, estimating these operators directly is challenging due to (i) the large observation space and (ii) insufficient coverage of the offline dataset. To tackle these challenges, we propose a novel algorithm that constructs confidence regions for these Bellman operators via offline estimation of their RKHS embeddings, and returns the final policy via pessimistic planning within the confidence regions. We prove that the proposed algorithm attains an \\(\\epsilon\\)-optimal policy using an offline dataset containing \\(\\tilde\\cO(1 / \\epsilon^2)\\){episodes}, provided that the behavior policy has good coverage over the optimal trajectory. 
To our best knowledge, our algorithm is the first provably sample efficient offline algorithm for POMDPs without uniform coverage assumptions.", "bibtex": "@InProceedings{pmlr-v162-guo22a,\n title = \t {Provably Efficient Offline Reinforcement Learning for Partially Observable {M}arkov Decision Processes},\n author = {Guo, Hongyi and Cai, Qi and Zhang, Yufeng and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8016--8038},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22a/guo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22a.html},\n abstract = \t {We study offline reinforcement learning (RL) for partially observable Markov decision processes (POMDPs) with possibly infinite state and observation spaces. Under the undercompleteness assumption, the optimal policy in such POMDPs are characterized by a class of finite-memory Bellman operators. In the offline setting, estimating these operators directly is challenging due to (i) the large observation space and (ii) insufficient coverage of the offline dataset. To tackle these challenges, we propose a novel algorithm that constructs confidence regions for these Bellman operators via offline estimation of their RKHS embeddings, and returns the final policy via pessimistic planning within the confidence regions. We prove that the proposed algorithm attains an \\(\\epsilon\\)-optimal policy using an offline dataset containing \\(\\tilde\\cO(1 / \\epsilon^2)\\){episodes}, provided that the behavior policy has good coverage over the optimal trajectory. 
To our best knowledge, our algorithm is the first provably sample efficient offline algorithm for POMDPs without uniform coverage assumptions.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22a/guo22a.pdf", "supp": "", "pdf_size": 407068, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14176907373158701054&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Industrial Engineering and Management Sciences, Northwestern University; Department of Industrial Engineering and Management Sciences, Northwestern University; Department of Industrial Engineering and Management Sciences, Northwestern University; Department of Statistics and Data Science, Yale University; Department of Industrial Engineering and Management Sciences, Northwestern University", "aff_domain": "u.northwestern.edu;u.northwestern.edu;u.northwestern.edu;yale.edu;gmail.com", "email": "u.northwestern.edu;u.northwestern.edu;u.northwestern.edu;yale.edu;gmail.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/guo22a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Northwestern University;Yale University", "aff_unique_dep": "Department of Industrial Engineering and Management Sciences;Department of Statistics and Data Science", "aff_unique_url": "https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "NU;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Proving Theorems using Incremental Learning and Hindsight Experience Replay", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18097", "id": "18097", "proceeding": "https://proceedings.mlr.press/v162/aygun22a.html", "poster": "/media/PosterPDFs/ICML%202022/412758d043dd247bddea07c7ec558c31_32BjMl3.png?t=1657624838.3949347", "slides": "", "author_site": "Eser Ayg\u00fcn, Ankit Anand, Laurent Orseau, Xavier Glorot, Stephen McAleer, Vlad Firoiu, Lei Zhang, Doina Precup, Shibl Mourad", "author": "Eser Ayg\u00fcn; Ankit Anand; Laurent Orseau; Xavier Glorot; Stephen M Mcaleer; Vlad Firoiu; Lei M Zhang; Doina Precup; Shibl Mourad", "abstract": "Traditional automated theorem proving systems for first-order logic depend on speed-optimized search and many handcrafted heuristics designed to work over a wide range of domains. Machine learning approaches in the literature either depend on these traditional provers to bootstrap themselves, by leveraging these heuristics, or can struggle due to limited existing proof data. The latter issue can be explained by the lack of a smooth difficulty gradient in theorem proving datasets; large gaps in difficulty between different theorems can make training harder or even impossible. In this paper, we adapt the idea of hindsight experience replay from reinforcement learning to the automated theorem proving domain, so as to use the intermediate data generated during unsuccessful proof attempts. We build a first-order logic prover by disabling all the smart clause-scoring heuristics of the state-of-the-art E prover and replacing them with a clause-scoring neural network learned by using hindsight experience replay in an incremental learning setting. Clauses are represented as graphs and presented to transformer networks with spectral features. 
We show that provers trained in this way can outperform previous machine learning approaches and compete with the state of the art heuristic-based theorem prover E in its best configuration, on the popular benchmarks MPTP2078, M2k and Mizar40. The proofs generated by our algorithm are also almost always significantly shorter than E\u2019s proofs.", "bibtex": "@InProceedings{pmlr-v162-aygun22a,\n title = \t {Proving Theorems using Incremental Learning and Hindsight Experience Replay},\n author = {Ayg{\\\"u}n, Eser and Anand, Ankit and Orseau, Laurent and Glorot, Xavier and Mcaleer, Stephen M and Firoiu, Vlad and Zhang, Lei M and Precup, Doina and Mourad, Shibl},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1198--1210},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/aygun22a/aygun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/aygun22a.html},\n abstract = \t {Traditional automated theorem proving systems for first-order logic depend on speed-optimized search and many handcrafted heuristics designed to work over a wide range of domains. Machine learning approaches in the literature either depend on these traditional provers to bootstrap themselves, by leveraging these heuristics, or can struggle due to limited existing proof data. The latter issue can be explained by the lack of a smooth difficulty gradient in theorem proving datasets; large gaps in difficulty between different theorems can make training harder or even impossible. In this paper, we adapt the idea of hindsight experience replay from reinforcement learning to the automated theorem proving domain, so as to use the intermediate data generated during unsuccessful proof attempts. We build a first-order logic prover by disabling all the smart clause-scoring heuristics of the state-of-the-art E prover and replacing them with a clause-scoring neural network learned by using hindsight experience replay in an incremental learning setting. Clauses are represented as graphs and presented to transformer networks with spectral features. We show that provers trained in this way can outperform previous machine learning approaches and compete with the state of the art heuristic-based theorem prover E in its best configuration, on the popular benchmarks MPTP2078, M2k and Mizar40. 
The proofs generated by our algorithm are also almost always significantly shorter than E\u2019s proofs.}\n}", "pdf": "https://proceedings.mlr.press/v162/aygun22a/aygun22a.pdf", "supp": "", "pdf_size": 427302, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6014838924852806153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind + Department of Information and Computer Science, University of California, Irvine; DeepMind; DeepMind; DeepMind + Mila - Quebec AI Institute + McGill University; DeepMind", "aff_domain": "deepmind.com;deepmind.com;deepmind.com; ; ; ; ; ; ", "email": "deepmind.com;deepmind.com;deepmind.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/aygun22a.html", "aff_unique_index": "0;0;0;0;0+1;0;0;0+2+3;0", "aff_unique_norm": "DeepMind;University of California, Irvine;Quebec AI Institute;McGill University", "aff_unique_dep": ";Department of Information and Computer Science;AI Institute;", "aff_unique_url": "https://deepmind.com;https://www.uci.edu;https://mila.quebec;https://www.mcgill.ca", "aff_unique_abbr": "DeepMind;UCI;Mila;McGill", "aff_campus_unique_index": "1;", "aff_campus_unique": ";Irvine", "aff_country_unique_index": "0;0;0;0;0+1;0;0;0+2+2;0", "aff_country_unique": "United Kingdom;United States;Canada" }, { "title": "ProxSkip: Yes! Local Gradient Steps Provably Lead to Communication Acceleration! Finally!", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18053", "id": "18053", "proceeding": "https://proceedings.mlr.press/v162/mishchenko22b.html", "poster": "", "slides": "", "author_site": "Konstantin Mishchenko, Grigory Malinovsky, Sebastian Stich, Peter Richtarik", "author": "Konstantin Mishchenko; Grigory Malinovsky; Sebastian Stich; Peter Richtarik", "abstract": "We introduce ProxSkip\u2014a surprisingly simple and provably efficient method for minimizing the sum of a smooth ($f$) and an expensive nonsmooth proximable ($\\psi$) function. The canonical approach to solving such problems is via the proximal gradient descent (ProxGD) algorithm, which is based on the evaluation of the gradient of $f$ and the prox operator of $\\psi$ in each iteration. In this work we are specifically interested in the regime in which the evaluation of prox is costly relative to the evaluation of the gradient, which is the case in many applications. ProxSkip allows for the expensive prox operator to be skipped in most iterations: while its iteration complexity is $\\mathcal{O}(\\kappa \\log \\nicefrac{1}{\\varepsilon})$, where $\\kappa$ is the condition number of $f$, the number of prox evaluations is $\\mathcal{O}(\\sqrt{\\kappa} \\log \\nicefrac{1}{\\varepsilon})$ only. Our main motivation comes from federated learning, where evaluation of the gradient operator corresponds to taking a local GD step independently on all devices, and evaluation of prox corresponds to (expensive) communication in the form of gradient averaging. In this context, ProxSkip offers an effective", "bibtex": "@InProceedings{pmlr-v162-mishchenko22b,\n title = \t {{P}rox{S}kip: Yes! {L}ocal Gradient Steps Provably Lead to Communication Acceleration! 
{F}inally!},\n author = {Mishchenko, Konstantin and Malinovsky, Grigory and Stich, Sebastian and Richtarik, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15750--15769},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mishchenko22b/mishchenko22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/mishchenko22b.html},\n abstract = \t {We introduce ProxSkip\u2014a surprisingly simple and provably efficient method for minimizing the sum of a smooth ($f$) and an expensive nonsmooth proximable ($\\psi$) function. The canonical approach to solving such problems is via the proximal gradient descent (ProxGD) algorithm, which is based on the evaluation of the gradient of $f$ and the prox operator of $\\psi$ in each iteration. In this work we are specifically interested in the regime in which the evaluation of prox is costly relative to the evaluation of the gradient, which is the case in many applications. ProxSkip allows for the expensive prox operator to be skipped in most iterations: while its iteration complexity is $\\mathcal{O}(\\kappa \\log \\nicefrac{1}{\\varepsilon})$, where $\\kappa$ is the condition number of $f$, the number of prox evaluations is $\\mathcal{O}(\\sqrt{\\kappa} \\log \\nicefrac{1}{\\varepsilon})$ only. Our main motivation comes from federated learning, where evaluation of the gradient operator corresponds to taking a local GD step independently on all devices, and evaluation of prox corresponds to (expensive) communication in the form of gradient averaging. 
In this context, ProxSkip offers an effective", "pdf": "https://proceedings.mlr.press/v162/mishchenko22b/mishchenko22b.pdf", "supp": "", "pdf_size": 987859, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10636796754462948775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "CNRS, ENS, Inria Sierra, Paris, France; Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia; CISPA Helmholtz Center for Information Security, Saarbr\u00fccken, Germany; Computer Science, King Abdullah University of Science and Technology, Thuwal, Saudi Arabia", "aff_domain": "inria.fr;kaust.edu.sa;cispa.de;kaust.edu.sa", "email": "inria.fr;kaust.edu.sa;cispa.de;kaust.edu.sa", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/mishchenko22b.html", "aff_unique_index": "0;1;2;1", "aff_unique_norm": "CNRS;King Abdullah University of Science and Technology;CISPA Helmholtz Center for Information Security", "aff_unique_dep": ";Computer Science;", "aff_unique_url": "https://www.cnrs.fr;https://www.kaust.edu.sa;https://www.cispa.de", "aff_unique_abbr": "CNRS;KAUST;CISPA", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Thuwal;Saarbr\u00fccken", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "France;Saudi Arabia;Germany" }, { "title": "Proximal Denoiser for Convergent Plug-and-Play Optimization with Nonconvex Regularization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18135", "id": "18135", "proceeding": "https://proceedings.mlr.press/v162/hurault22a.html", "poster": "/media/PosterPDFs/ICML%202022/a5d42e4024cc540befb48f466820e25f.png?t=1657814169.9576764", "slides": "/media/icml-2022/Slides/18135.pdf", "author_site": "Samuel Hurault, Arthur Leclaire, Nicolas Papadakis", "author": "Samuel Hurault; Arthur Leclaire; Nicolas Papadakis", "abstract": "Plug-and-Play (PnP) methods solve ill-posed inverse problems through iterative proximal algorithms by replacing a proximal operator by a denoising operation. When applied with deep neural network denoisers, these methods have shown state-of-the-art visual performance for image restoration problems. However, their theoretical convergence analysis is still incomplete. Most of the existing convergence results consider nonexpansive denoisers, which is non-realistic, or limit their analysis to strongly convex data-fidelity terms in the inverse problem to solve. Recently, it was proposed to train the denoiser as a gradient descent step on a functional parameterized by a deep neural network. Using such a denoiser guarantees the convergence of the PnP version of the Half-Quadratic-Splitting (PnP-HQS) iterative algorithm. In this paper, we show that this gradient denoiser can actually correspond to the proximal operator of another scalar function. Given this new result, we exploit the convergence theory of proximal algorithms in the nonconvex setting to obtain convergence results for PnP-PGD (Proximal Gradient Descent) and PnP-ADMM (Alternating Direction Method of Multipliers). When built on top of a smooth gradient denoiser, we show that PnP-PGD and PnP-ADMM are convergent and target stationary points of an explicit functional. 
These convergence results are confirmed with numerical experiments on deblurring, super-resolution and inpainting.", "bibtex": "@InProceedings{pmlr-v162-hurault22a,\n title = \t {Proximal Denoiser for Convergent Plug-and-Play Optimization with Nonconvex Regularization},\n author = {Hurault, Samuel and Leclaire, Arthur and Papadakis, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9483--9505},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hurault22a/hurault22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hurault22a.html},\n abstract = \t {Plug-and-Play (PnP) methods solve ill-posed inverse problems through iterative proximal algorithms by replacing a proximal operator by a denoising operation. When applied with deep neural network denoisers, these methods have shown state-of-the-art visual performance for image restoration problems. However, their theoretical convergence analysis is still incomplete. Most of the existing convergence results consider nonexpansive denoisers, which is non-realistic, or limit their analysis to strongly convex data-fidelity terms in the inverse problem to solve. Recently, it was proposed to train the denoiser as a gradient descent step on a functional parameterized by a deep neural network. Using such a denoiser guarantees the convergence of the PnP version of the Half-Quadratic-Splitting (PnP-HQS) iterative algorithm. In this paper, we show that this gradient denoiser can actually correspond to the proximal operator of another scalar function. Given this new result, we exploit the convergence theory of proximal algorithms in the nonconvex setting to obtain convergence results for PnP-PGD (Proximal Gradient Descent) and PnP-ADMM (Alternating Direction Method of Multipliers). When built on top of a smooth gradient denoiser, we show that PnP-PGD and PnP-ADMM are convergent and target stationary points of an explicit functional. These convergence results are confirmed with numerical experiments on deblurring, super-resolution and inpainting.}\n}", "pdf": "https://proceedings.mlr.press/v162/hurault22a/hurault22a.pdf", "supp": "", "pdf_size": 6352614, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12256965087281375600&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Univ. Bordeaux, Bordeaux INP, CNRS, IMB, UMR 5251, F-33400 Talence, France; Univ. Bordeaux, Bordeaux INP, CNRS, IMB, UMR 5251, F-33400 Talence, France; Univ. Bordeaux, Bordeaux INP, CNRS, IMB, UMR 5251, F-33400 Talence, France", "aff_domain": "math.u-bordeaux.fr; ; ", "email": "math.u-bordeaux.fr; ; ", "github": "https://github.com/samuro95/Prox-PnP", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hurault22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Bordeaux", "aff_unique_dep": "Institut de Math\u00e9matiques de Bordeaux (IMB)", "aff_unique_url": "https://www.univ-bordeaux.fr", "aff_unique_abbr": "Univ. 
Bordeaux", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Talence", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Proximal Exploration for Model-guided Protein Sequence Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17763", "id": "17763", "proceeding": "https://proceedings.mlr.press/v162/ren22a.html", "poster": "/media/PosterPDFs/ICML%202022/9f96f36b7aae3b1ff847c26ac94c604e.png?t=1657186273.0849366", "slides": "", "author_site": "Zhizhou Ren, Jiahan Li, Fan Ding, Yuan Zhou, Jianzhu Ma, Jian Peng", "author": "Zhizhou Ren; Jiahan Li; Fan Ding; Yuan Zhou; Jianzhu Ma; Jian Peng", "abstract": "Designing protein sequences with a particular biological function is a long-lasting challenge for protein engineering. Recent advances in machine-learning-guided approaches focus on building a surrogate sequence-function model to reduce the burden of expensive in-lab experiments. In this paper, we study the exploration mechanism of model-guided sequence design. We leverage a natural property of protein fitness landscape that a concise set of mutations upon the wild-type sequence are usually sufficient to enhance the desired function. By utilizing this property, we propose Proximal Exploration (PEX) algorithm that prioritizes the evolutionary search for high-fitness mutants with low mutation counts. In addition, we develop a specialized model architecture, called Mutation Factorization Network (MuFacNet), to predict low-order mutational effects, which further improves the sample efficiency of model-guided evolution. In experiments, we extensively evaluate our method on a suite of in-silico protein sequence design tasks and demonstrate substantial improvement over baseline algorithms.", "bibtex": "@InProceedings{pmlr-v162-ren22a,\n title = \t {Proximal Exploration for Model-guided Protein Sequence Design},\n author = {Ren, Zhizhou and Li, Jiahan and Ding, Fan and Zhou, Yuan and Ma, Jianzhu and Peng, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18520--18536},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ren22a/ren22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ren22a.html},\n abstract = \t {Designing protein sequences with a particular biological function is a long-lasting challenge for protein engineering. Recent advances in machine-learning-guided approaches focus on building a surrogate sequence-function model to reduce the burden of expensive in-lab experiments. In this paper, we study the exploration mechanism of model-guided sequence design. We leverage a natural property of protein fitness landscape that a concise set of mutations upon the wild-type sequence are usually sufficient to enhance the desired function. By utilizing this property, we propose Proximal Exploration (PEX) algorithm that prioritizes the evolutionary search for high-fitness mutants with low mutation counts. In addition, we develop a specialized model architecture, called Mutation Factorization Network (MuFacNet), to predict low-order mutational effects, which further improves the sample efficiency of model-guided evolution. 
In experiments, we extensively evaluate our method on a suite of in-silico protein sequence design tasks and demonstrate substantial improvement over baseline algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/ren22a/ren22a.pdf", "supp": "", "pdf_size": 1772072, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13916824899355804754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "HeliXon Limited+Department of Computer Science, University of Illinois at Urbana-Champaign; HeliXon Limited+Institute for Artificial Intelligence, Peking University; HeliXon Limited; Yau Mathematical Sciences Center, Tsinghua University; Institute for Artificial Intelligence, Peking University; Department of Computer Science, University of Illinois at Urbana-Champaign+Institute for Industry AI Research, Tsinghua University", "aff_domain": "illinois.edu; ; ; ; ;illinois.edu", "email": "illinois.edu; ; ; ; ;illinois.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ren22a.html", "aff_unique_index": "0+1;0+2;0;3;2;1+3", "aff_unique_norm": "HeliXon Limited;University of Illinois Urbana-Champaign;Peking University;Tsinghua University", "aff_unique_dep": ";Department of Computer Science;Institute for Artificial Intelligence;Yau Mathematical Sciences Center", "aff_unique_url": ";https://illinois.edu;http://www.pku.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": ";UIUC;PKU;THU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0+1;0+2;0;2;2;1+2", "aff_country_unique": "United Kingdom;United States;China" }, { "title": "Proximal and Federated Random Reshuffling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16515", "id": "16515", "proceeding": "https://proceedings.mlr.press/v162/mishchenko22a.html", "poster": "", "slides": "", "author_site": "Konstantin Mishchenko, Ahmed Khaled, Peter Richtarik", "author": "Konstantin Mishchenko; Ahmed Khaled; Peter Richtarik", "abstract": "Random Reshuffling (RR), also known as Stochastic Gradient Descent (SGD) without replacement, is a popular and theoretically grounded method for finite-sum minimization. We propose two new algorithms: Proximal and Federated Random Reshuffling (ProxRR and FedRR). The first algorithm, ProxRR, solves composite finite-sum minimization problems in which the objective is the sum of a (potentially non-smooth) convex regularizer and an average of $n$ smooth objectives. ProxRR evaluates the proximal operator once per epoch only. When the proximal operator is expensive to compute, this small difference makes ProxRR up to $n$ times faster than algorithms that evaluate the proximal operator in every iteration, such as proximal (stochastic) gradient descent. We give examples of practical optimization tasks where the proximal operator is difficult to compute and ProxRR has a clear advantage. One such task is federated or distributed optimization, where the evaluation of the proximal operator corresponds to communication across the network. We obtain our second algorithm, FedRR, as a special case of ProxRR applied to federated optimization, and prove it has a smaller communication footprint than either distributed gradient descent or Local SGD. Our theory covers both constant and decreasing stepsizes, and allows for importance resampling schemes that can improve conditioning, which may be of independent interest. Our theory covers both convex and nonconvex regimes. 
Finally, we corroborate our results with experiments on real data sets.", "bibtex": "@InProceedings{pmlr-v162-mishchenko22a,\n title = \t {Proximal and Federated Random Reshuffling},\n author = {Mishchenko, Konstantin and Khaled, Ahmed and Richtarik, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15718--15749},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mishchenko22a/mishchenko22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mishchenko22a.html},\n abstract = \t {Random Reshuffling (RR), also known as Stochastic Gradient Descent (SGD) without replacement, is a popular and theoretically grounded method for finite-sum minimization. We propose two new algorithms: Proximal and Federated Random Reshuffling (ProxRR and FedRR). The first algorithm, ProxRR, solves composite finite-sum minimization problems in which the objective is the sum of a (potentially non-smooth) convex regularizer and an average of $n$ smooth objectives. ProxRR evaluates the proximal operator once per epoch only. When the proximal operator is expensive to compute, this small difference makes ProxRR up to $n$ times faster than algorithms that evaluate the proximal operator in every iteration, such as proximal (stochastic) gradient descent. We give examples of practical optimization tasks where the proximal operator is difficult to compute and ProxRR has a clear advantage. One such task is federated or distributed optimization, where the evaluation of the proximal operator corresponds to communication across the network. We obtain our second algorithm, FedRR, as a special case of ProxRR applied to federated optimization, and prove it has a smaller communication footprint than either distributed gradient descent or Local SGD. Our theory covers both constant and decreasing stepsizes, and allows for importance resampling schemes that can improve conditioning, which may be of independent interest. Our theory covers both convex and nonconvex regimes. 
Finally, we corroborate our results with experiments on real data sets.}\n}", "pdf": "https://proceedings.mlr.press/v162/mishchenko22a/mishchenko22a.pdf", "supp": "", "pdf_size": 3313177, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4410848419822485671&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "CNRS, DI ENS, Inria; Princeton University; KAUST", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mishchenko22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "CNRS;Princeton University;King Abdullah University of Science and Technology", "aff_unique_dep": "DI ENS;;", "aff_unique_url": "https://www.cnrs.fr;https://www.princeton.edu;https://www.kaust.edu.sa", "aff_unique_abbr": "CNRS;Princeton;KAUST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "France;United States;Saudi Arabia" }, { "title": "Public Data-Assisted Mirror Descent for Private Model Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16901", "id": "16901", "proceeding": "https://proceedings.mlr.press/v162/amid22a.html", "poster": "/media/PosterPDFs/ICML%202022/e6c2dc3dee4a51dcec3a876aa2339a78.png?t=1658250718.9749303", "slides": "", "author_site": "Ehsan Amid, Arun Ganesh, Rajiv Mathews, Swaroop Ramaswamy, Shuang Song, Thomas Steinke, Thomas Steinke, Vinith Suriyakumar, Om Thakkar, Abhradeep Guha Thakurta", "author": "Ehsan Amid; Arun Ganesh; Rajiv Mathews; Swaroop Ramaswamy; Shuang Song; Thomas Steinke; Thomas Steinke; Vinith M Suriyakumar; Om Thakkar; Abhradeep Thakurta", "abstract": "In this paper, we revisit the problem of using in-distribution public data to improve the privacy/utility trade-offs for differentially private (DP) model training. (Here, public data refers to auxiliary data sets that have no privacy concerns.) We design a natural variant of DP mirror descent, where the DP gradients of the private/sensitive data act as the linear term, and the loss generated by the public data as the mirror map. We show that, for linear regression with feature vectors drawn from a non-isotropic sub-Gaussian distribution, our algorithm, PDA-DPMD (a variant of mirror descent), provides population risk guarantees that are asymptotically better than the best known guarantees under DP (without having access to public data), when the number of public data samples is sufficiently large. We further show that our algorithm has natural \u201cnoise stability\u201d properties that control the variance due to noise added to ensure DP. We demonstrate the efficacy of our algorithm by showing privacy/utility trade-offs on four benchmark datasets (StackOverflow, WikiText-2, CIFAR-10, and EMNIST). 
We show that our algorithm not only significantly improves over traditional DP-SGD, which does not have access to public data, but to our knowledge is the first to improve over DP-SGD on models that have been pre-trained with public data.", "bibtex": "@InProceedings{pmlr-v162-amid22a,\n title = \t {Public Data-Assisted Mirror Descent for Private Model Training},\n author = {Amid, Ehsan and Ganesh, Arun and Mathews, Rajiv and Ramaswamy, Swaroop and Song, Shuang and Steinke, Thomas and Steinke, Thomas and Suriyakumar, Vinith M and Thakkar, Om and Thakurta, Abhradeep},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {517--535},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/amid22a/amid22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/amid22a.html},\n abstract = \t {In this paper, we revisit the problem of using in-distribution public data to improve the privacy/utility trade-offs for differentially private (DP) model training. (Here, public data refers to auxiliary data sets that have no privacy concerns.) We design a natural variant of DP mirror descent, where the DP gradients of the private/sensitive data act as the linear term, and the loss generated by the public data as the mirror map. We show that, for linear regression with feature vectors drawn from a non-isotropic sub-Gaussian distribution, our algorithm, PDA-DPMD (a variant of mirror descent), provides population risk guarantees that are asymptotically better than the best known guarantees under DP (without having access to public data), when the number of public data samples is sufficiently large. We further show that our algorithm has natural \u201cnoise stability\u201d properties that control the variance due to noise added to ensure DP. We demonstrate the efficacy of our algorithm by showing privacy/utility trade-offs on four benchmark datasets (StackOverflow, WikiText-2, CIFAR-10, and EMNIST). 
We show that our algorithm not only significantly improves over traditional DP-SGD, which does not have access to public data, but to our knowledge is the first to improve over DP-SGD on models that have been pre-trained with public data.}\n}", "pdf": "https://proceedings.mlr.press/v162/amid22a/amid22a.pdf", "supp": "", "pdf_size": 690688, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16311948829499291413&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "email": ";;;;;;;;;", "github": "", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/amid22a.html" }, { "title": "Pure Noise to the Rescue of Insufficient Data: Improving Imbalanced Classification by Training on Random Noise Images", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16419", "id": "16419", "proceeding": "https://proceedings.mlr.press/v162/zada22a.html", "poster": "/media/PosterPDFs/ICML%202022/fbd7939d674997cdb4692d34de8633c4_UJAfdRF.png?t=1657204720.4970658", "slides": "", "author_site": "Shiran Zada, Itay Benou, Michal Irani", "author": "Shiran Zada; Itay Benou; Michal Irani", "abstract": "Despite remarkable progress on visual recognition tasks, deep neural-nets still struggle to generalize well when training data is scarce or highly imbalanced, rendering them extremely vulnerable to real-world examples. In this paper, we present a surprisingly simple yet highly effective method to mitigate this limitation: using pure noise images as additional training data. Unlike the common use of additive noise or adversarial noise for data augmentation, we propose an entirely different perspective by directly training on pure random noise images. We present a new Distribution-Aware Routing Batch Normalization layer (DAR-BN), which enables training on pure noise images in addition to natural images within the same network. This encourages generalization and suppresses overfitting. Our proposed method significantly improves imbalanced classification performance, obtaining state-of-the-art results on a large variety of long-tailed image classification datasets (CIFAR-10-LT, CIFAR-100-LT, ImageNet-LT, Places-LT, and CelebA-5). Furthermore, our method is extremely simple and easy to use as a general new augmentation tool (on top of existing augmentations), and can be incorporated in any training scheme. 
It does not require any specialized data generation or training procedures, thus keeping training fast and efficient.", "bibtex": "@InProceedings{pmlr-v162-zada22a,\n title = \t {Pure Noise to the Rescue of Insufficient Data: Improving Imbalanced Classification by Training on Random Noise Images},\n author = {Zada, Shiran and Benou, Itay and Irani, Michal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25817--25833},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zada22a/zada22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zada22a.html},\n abstract = \t {Despite remarkable progress on visual recognition tasks, deep neural-nets still struggle to generalize well when training data is scarce or highly imbalanced, rendering them extremely vulnerable to real-world examples. In this paper, we present a surprisingly simple yet highly effective method to mitigate this limitation: using pure noise images as additional training data. Unlike the common use of additive noise or adversarial noise for data augmentation, we propose an entirely different perspective by directly training on pure random noise images. We present a new Distribution-Aware Routing Batch Normalization layer (DAR-BN), which enables training on pure noise images in addition to natural images within the same network. This encourages generalization and suppresses overfitting. Our proposed method significantly improves imbalanced classification performance, obtaining state-of-the-art results on a large variety of long-tailed image classification datasets (CIFAR-10-LT, CIFAR-100-LT, ImageNet-LT, Places-LT, and CelebA-5). Furthermore, our method is extremely simple and easy to use as a general new augmentation tool (on top of existing augmentations), and can be incorporated in any training scheme. 
It does not require any specialized data generation or training procedures, thus keeping training fast and efficient.}\n}", "pdf": "https://proceedings.mlr.press/v162/zada22a/zada22a.pdf", "supp": "", "pdf_size": 2063150, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13535908408356605995&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel; Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel; Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel", "aff_domain": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "email": "weizmann.ac.il;weizmann.ac.il;weizmann.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zada22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Weizmann Institute of Science", "aff_unique_dep": "Department of Computer Science and Applied Mathematics", "aff_unique_url": "https://www.weizmann.ac.il", "aff_unique_abbr": "Weizmann", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Rehovot", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "QSFL: A Two-Level Uplink Communication Optimization Framework for Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15967", "id": "15967", "proceeding": "https://proceedings.mlr.press/v162/yi22a.html", "poster": "/media/PosterPDFs/ICML%202022/33e75ff09dd601bbe69f351039152189_yFDZsSW.png?t=1657367632.2077925", "slides": "", "author_site": "Liping Yi, Wang Gang, Liu Xiaoguang", "author": "Liping Yi; Wang Gang; Liu Xiaoguang", "abstract": "In cross-device Federated Learning (FL), the communication cost of transmitting full-precision models between edge devices and a central server is a significant bottleneck, due to expensive, unreliable, and low-bandwidth wireless connections. As a solution, we propose a novel FL framework named QSFL, towards optimizing FL uplink (client-to-server) communication at both client and model levels. At the client level, we design a Qualification Judgment (QJ) algorithm to sample high-qualification clients to upload models. At the model level, we explore a Sparse Cyclic Sliding Segment (SCSS) algorithm to further compress transmitted models. We prove that QSFL can converge over wall-to-wall time, and develop an optimal hyperparameter searching algorithm based on theoretical analysis to enable QSFL to make the best trade-off between model accuracy and communication cost. 
Experimental results show that QSFL achieves state-of-the-art compression ratios with marginal model accuracy degradation.", "bibtex": "@InProceedings{pmlr-v162-yi22a,\n title = \t {{QSFL}: A Two-Level Uplink Communication Optimization Framework for Federated Learning},\n author = {Yi, Liping and Gang, Wang and Xiaoguang, Liu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25501--25513},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yi22a/yi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yi22a.html},\n abstract = \t {In cross-device Federated Learning (FL), the communication cost of transmitting full-precision models between edge devices and a central server is a significant bottleneck, due to expensive, unreliable, and low-bandwidth wireless connections. As a solution, we propose a novel FL framework named QSFL, towards optimizing FL uplink (client-to-server) communication at both client and model levels. At the client level, we design a Qualification Judgment (QJ) algorithm to sample high-qualification clients to upload models. At the model level, we explore a Sparse Cyclic Sliding Segment (SCSS) algorithm to further compress transmitted models. We prove that QSFL can converge over wall-to-wall time, and develop an optimal hyperparameter searching algorithm based on theoretical analysis to enable QSFL to make the best trade-off between model accuracy and communication cost. Experimental results show that QSFL achieves state-of-the-art compression ratios with marginal model accuracy degradation.}\n}", "pdf": "https://proceedings.mlr.press/v162/yi22a/yi22a.pdf", "supp": "", "pdf_size": 1623974, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4563679524782554750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Nankai-Orange D.T. Joint Lab, College of Computer Science, Nankai University, Tianjin, China; Nankai-Orange D.T. Joint Lab, College of Computer Science, Nankai University, Tianjin, China; Nankai-Orange D.T. 
Joint Lab, College of Computer Science, Nankai University, Tianjin, China", "aff_domain": "nbjl.nankai.edu.cn;nbjl.nankai.edu.cn; ", "email": "nbjl.nankai.edu.cn;nbjl.nankai.edu.cn; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yi22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Nankai University", "aff_unique_dep": "College of Computer Science", "aff_unique_url": "http://www.nankai.edu.cn", "aff_unique_abbr": "Nankai", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Tianjin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Quant-BnB: A Scalable Branch-and-Bound Method for Optimal Decision Trees with Continuous Features", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17099", "id": "17099", "proceeding": "https://proceedings.mlr.press/v162/mazumder22a.html", "poster": "/media/PosterPDFs/ICML%202022/aebf7782a3d445f43cf30ee2c0d84dee_9kq37jb.png?t=1657375994.9258845", "slides": "", "author_site": "Rahul Mazumder, Xiang Meng, Haoyue Wang", "author": "Rahul Mazumder; Xiang Meng; Haoyue Wang", "abstract": "Decision trees are one of the most useful and popular methods in the machine learning toolbox. In this paper, we consider the problem of learning optimal decision trees, a combinatorial optimization problem that is challenging to solve at scale. A common approach in the literature is to use greedy heuristics, which may not be optimal. Recently there has been significant interest in learning optimal decision trees using various approaches (e.g., based on integer programming, dynamic programming)\u2014to achieve computational scalability, most of these approaches focus on classification tasks with binary features. In this paper, we present a new discrete optimization method based on branch-and-bound (BnB) to obtain optimal decision trees. Different from existing customized approaches, we consider both regression and classification tasks with continuous features. The basic idea underlying our approach is to split the search space based on the quantiles of the feature distribution\u2014leading to upper and lower bounds for the underlying optimization problem along the BnB iterations. Our proposed algorithm Quant-BnB shows significant speedups compared to existing approaches for shallow optimal trees on various real datasets.", "bibtex": "@InProceedings{pmlr-v162-mazumder22a,\n title = \t {Quant-{B}n{B}: A Scalable Branch-and-Bound Method for Optimal Decision Trees with Continuous Features},\n author = {Mazumder, Rahul and Meng, Xiang and Wang, Haoyue},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15255--15277},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mazumder22a/mazumder22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mazumder22a.html},\n abstract = \t {Decision trees are one of the most useful and popular methods in the machine learning toolbox. In this paper, we consider the problem of learning optimal decision trees, a combinatorial optimization problem that is challenging to solve at scale. A common approach in the literature is to use greedy heuristics, which may not be optimal. 
Recently there has been significant interest in learning optimal decision trees using various approaches (e.g., based on integer programming, dynamic programming)\u2014to achieve computational scalability, most of these approaches focus on classification tasks with binary features. In this paper, we present a new discrete optimization method based on branch-and-bound (BnB) to obtain optimal decision trees. Different from existing customized approaches, we consider both regression and classification tasks with continuous features. The basic idea underlying our approach is to split the search space based on the quantiles of the feature distribution\u2014leading to upper and lower bounds for the underlying optimization problem along the BnB iterations. Our proposed algorithm Quant-BnB shows significant speedups compared to existing approaches for shallow optimal trees on various real datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/mazumder22a/mazumder22a.pdf", "supp": "", "pdf_size": 495636, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9194953273647842731&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Operations Research Center, MIT, Cambridge, USA+ Sloan School of Management and Center for Statistics, MIT, Cambridge, USA; Operations Research Center, MIT, Cambridge, USA+ Sloan School of Management and Center for Statistics, MIT, Cambridge, USA; Operations Research Center, MIT, Cambridge, USA", "aff_domain": "mit.edu; ; ", "email": "mit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mazumder22a.html", "aff_unique_index": "0+0;0+0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Operations Research Center", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Quantification and Analysis of Layer-wise and Pixel-wise Information Discarding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17919", "id": "17919", "proceeding": "https://proceedings.mlr.press/v162/ma22b.html", "poster": "/media/PosterPDFs/ICML%202022/ca8155f4d27f205953f9d3d7974bdd70.png?t=1657525513.3527792", "slides": "", "author_site": "Haotian Ma, Hao Zhang, Fan Zhou, Yinqing Zhang, Quanshi Zhang", "author": "Haotian Ma; Hao Zhang; Fan Zhou; Yinqing Zhang; Quanshi Zhang", "abstract": "This paper presents a method to explain how the information of each input variable is gradually discarded during the forward propagation in a deep neural network (DNN), which provides new perspectives to explain DNNs. We define two types of entropy-based metrics, i.e. (1) the discarding of pixel-wise information used in the forward propagation, and (2) the uncertainty of the input reconstruction, to measure input information contained by a specific layer from two perspectives. Unlike previous attribution metrics, the proposed metrics ensure the fairness of comparisons between different layers of different DNNs. We can use these metrics to analyze the efficiency of information processing in DNNs, which exhibits strong connections to the performance of DNNs. We analyze information discarding in a pixel-wise manner, which is different from the information bottleneck theory measuring feature information w.r.t. the sample distribution. 
Experiments have shown the effectiveness of our metrics in analyzing classic DNNs and explaining existing deep-learning techniques. The code is available at https://github.com/haotianSustc/deepinfo.", "bibtex": "@InProceedings{pmlr-v162-ma22b,\n title = \t {Quantification and Analysis of Layer-wise and Pixel-wise Information Discarding},\n author = {Ma, Haotian and Zhang, Hao and Zhou, Fan and Zhang, Yinqing and Zhang, Quanshi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14664--14698},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ma22b/ma22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/ma22b.html},\n abstract = \t {This paper presents a method to explain how the information of each input variable is gradually discarded during the forward propagation in a deep neural network (DNN), which provides new perspectives to explain DNNs. We define two types of entropy-based metrics, i.e. (1) the discarding of pixel-wise information used in the forward propagation, and (2) the uncertainty of the input reconstruction, to measure input information contained by a specific layer from two perspectives. Unlike previous attribution metrics, the proposed metrics ensure the fairness of comparisons between different layers of different DNNs. We can use these metrics to analyze the efficiency of information processing in DNNs, which exhibits strong connections to the performance of DNNs. We analyze information discarding in a pixel-wise manner, which is different from the information bottleneck theory measuring feature information w.r.t. the sample distribution. Experiments have shown the effectiveness of our metrics in analyzing classic DNNs and explaining existing deep-learning techniques. 
The code is available at https://github.com/haotianSustc/deepinfo.}\n}", "pdf": "https://proceedings.mlr.press/v162/ma22b/ma22b.pdf", "supp": "", "pdf_size": 15718289, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11032731236809271983&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Shanghai Jiao Tong University, Shanghai, China+Southern University of Science and Technology, Shenzhen, China; Southern University of Science and Technology, Shenzhen, China; Shanghai Jiao Tong University, Shanghai, China; Shanghai Jiao Tong University, Shanghai, China; Shanghai Jiao Tong University, Shanghai, China", "aff_domain": "usc.edu;usc.edu;usc.edu;usc.edu;sjtu.edu.cn", "email": "usc.edu;usc.edu;usc.edu;usc.edu;sjtu.edu.cn", "github": "https://github.com/haotianSustc/deepinfo", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ma22b.html", "aff_unique_index": "0+1;1;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University;Southern University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.sustech.edu.cn", "aff_unique_abbr": "SJTU;SUSTech", "aff_campus_unique_index": "0+1;1;0;0;0", "aff_campus_unique": "Shanghai;Shenzhen", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Quantifying and Learning Linear Symmetry-Based Disentanglement", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17701", "id": "17701", "proceeding": "https://proceedings.mlr.press/v162/tonnaer22a.html", "poster": "/media/PosterPDFs/ICML%202022/60cb558c40e4f18479664069d9642d5a_wsSloPf.png?t=1657376401.739206", "slides": "", "author_site": "Loek Tonnaer, Luis Armando Perez Rey, Vlado Menkovski, Mike Holenderski, Jacobus Portegies", "author": "Loek Tonnaer; Luis Armando Perez Rey; Vlado Menkovski; Mike Holenderski; Jim Portegies", "abstract": "The definition of Linear Symmetry-Based Disentanglement (LSBD) formalizes the notion of linearly disentangled representations, but there is currently no metric to quantify LSBD. Such a metric is crucial to evaluate LSBD methods and to compare them to previous understandings of disentanglement. We propose D_LSBD, a mathematically sound metric to quantify LSBD, and provide a practical implementation for SO(2) groups. Furthermore, from this metric we derive LSBD-VAE, a semi-supervised method to learn LSBD representations. 
We demonstrate the utility of our metric by showing that (1) common VAE-based disentanglement methods don\u2019t learn LSBD representations, (2) LSBD-VAE, as well as other recent methods, can learn LSBD representations needing only limited supervision on transformations, and (3) various desirable properties expressed by existing disentanglement metrics are also achieved by LSBD representations.", "bibtex": "@InProceedings{pmlr-v162-tonnaer22a,\n title = \t {Quantifying and Learning Linear Symmetry-Based Disentanglement},\n author = {Tonnaer, Loek and Rey, Luis Armando Perez and Menkovski, Vlado and Holenderski, Mike and Portegies, Jim},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21584--21608},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tonnaer22a/tonnaer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tonnaer22a.html},\n abstract = \t {The definition of Linear Symmetry-Based Disentanglement (LSBD) formalizes the notion of linearly disentangled representations, but there is currently no metric to quantify LSBD. Such a metric is crucial to evaluate LSBD methods and to compare them to previous understandings of disentanglement. We propose D_LSBD, a mathematically sound metric to quantify LSBD, and provide a practical implementation for SO(2) groups. Furthermore, from this metric we derive LSBD-VAE, a semi-supervised method to learn LSBD representations. We demonstrate the utility of our metric by showing that (1) common VAE-based disentanglement methods don\u2019t learn LSBD representations, (2) LSBD-VAE, as well as other recent methods, can learn LSBD representations needing only limited supervision on transformations, and (3) various desirable properties expressed by existing disentanglement metrics are also achieved by LSBD representations.}\n}", "pdf": "https://proceedings.mlr.press/v162/tonnaer22a/tonnaer22a.pdf", "supp": "", "pdf_size": 28880900, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11951723712936247797&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Eindhoven University of Technology (TU/e), Eindhoven, The Netherlands + Eindhoven Artificial Intelligence Systems Institute (EAISI), Eindhoven, the Netherlands; Eindhoven University of Technology (TU/e), Eindhoven, The Netherlands + Eindhoven Artificial Intelligence Systems Institute (EAISI), Eindhoven, the Netherlands + Prosus, Amsterdam, The Netherlands; Eindhoven University of Technology (TU/e), Eindhoven, The Netherlands + Eindhoven Artificial Intelligence Systems Institute (EAISI), Eindhoven, the Netherlands; Eindhoven University of Technology (TU/e), Eindhoven, The Netherlands + Eindhoven Artificial Intelligence Systems Institute (EAISI), Eindhoven, the Netherlands; Eindhoven University of Technology (TU/e), Eindhoven, The Netherlands + Eindhoven Artificial Intelligence Systems Institute (EAISI), Eindhoven, the Netherlands", "aff_domain": "tue.nl;tue.nl; ; ; ", "email": "tue.nl;tue.nl; ; ; ", "github": "https://github.com/luis-armando-perez-rey/lsbd-vae", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/tonnaer22a.html", "aff_unique_index": "0+1;0+1+2;0+1;0+1;0+1", "aff_unique_norm": "Eindhoven University of 
Technology;Eindhoven Artificial Intelligence Systems Institute;Prosus", "aff_unique_dep": ";Artificial Intelligence Systems;", "aff_unique_url": "https://www.tue.nl;;https://www.prosus.com", "aff_unique_abbr": "TU/e;EAISI;", "aff_campus_unique_index": "0+0;0+0+1;0+0;0+0;0+0", "aff_campus_unique": "Eindhoven;Amsterdam", "aff_country_unique_index": "0+0;0+0+0;0+0;0+0;0+0", "aff_country_unique": "Netherlands" }, { "title": "Quantum-Inspired Algorithms from Randomized Numerical Linear Algebra", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16653", "id": "16653", "proceeding": "https://proceedings.mlr.press/v162/chepurko22a.html", "poster": "/media/PosterPDFs/ICML%202022/93189dd27c5c3221f5687b74bcba0ab6.png?t=1657690373.1958141", "slides": "", "author_site": "Nadiia Chepurko, Kenneth Clarkson, Lior Horesh, Honghao Lin, David Woodruff", "author": "Nadiia Chepurko; Kenneth Clarkson; Lior Horesh; Honghao Lin; David Woodruff", "abstract": "We create classical (non-quantum) dynamic data structures supporting queries for recommender systems and least-squares regression that are comparable to their quantum analogues. De-quantizing such algorithms has received a flurry of attention in recent years; we obtain sharper bounds for these problems. More significantly, we achieve these improvements by arguing that the previous quantum-inspired algorithms for these problems are doing leverage or ridge-leverage score sampling in disguise; these are powerful and standard techniques in randomized numerical linear algebra. With this recognition, we are able to employ the large body of work in numerical linear algebra to obtain algorithms for these problems that are simpler or faster (or both) than existing approaches. Our experiments demonstrate that the proposed data structures also work well on real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-chepurko22a,\n title = \t {Quantum-Inspired Algorithms from Randomized Numerical Linear Algebra},\n author = {Chepurko, Nadiia and Clarkson, Kenneth and Horesh, Lior and Lin, Honghao and Woodruff, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3879--3900},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chepurko22a/chepurko22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chepurko22a.html},\n abstract = \t {We create classical (non-quantum) dynamic data structures supporting queries for recommender systems and least-squares regression that are comparable to their quantum analogues. De-quantizing such algorithms has received a flurry of attention in recent years; we obtain sharper bounds for these problems. More significantly, we achieve these improvements by arguing that the previous quantum-inspired algorithms for these problems are doing leverage or ridge-leverage score sampling in disguise; these are powerful and standard techniques in randomized numerical linear algebra. With this recognition, we are able to employ the large body of work in numerical linear algebra to obtain algorithms for these problems that are simpler or faster (or both) than existing approaches. 
Our experiments demonstrate that the proposed data structures also work well on real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/chepurko22a/chepurko22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chepurko22a-supp.zip", "pdf_size": 498304, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8202214935414637469&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chepurko22a.html" }, { "title": "Query-Efficient and Scalable Black-Box Adversarial Attacks on Discrete Sequential Data via Bayesian Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17915", "id": "17915", "proceeding": "https://proceedings.mlr.press/v162/lee22h.html", "poster": "/media/PosterPDFs/ICML%202022/50abc3e730e36b387ca8e02c26dc0a22.png?t=1657335018.5697575", "slides": "", "author_site": "Deokjae Lee, Seungyong Moon, Junhyeok Lee, Hyun Oh Song", "author": "Deokjae Lee; Seungyong Moon; Junhyeok Lee; Hyun Oh Song", "abstract": "We focus on the problem of adversarial attacks against models on discrete sequential data in the black-box setting where the attacker aims to craft adversarial examples with limited query access to the victim model. Existing black-box attacks, mostly based on greedy algorithms, find adversarial examples using pre-computed key positions to perturb, which severely limits the search space and might result in suboptimal solutions. To this end, we propose a query-efficient black-box attack using Bayesian optimization, which dynamically computes important positions using an automatic relevance determination (ARD) categorical kernel. We introduce block decomposition and history subsampling techniques to improve the scalability of Bayesian optimization when an input sequence becomes long. Moreover, we develop a post-optimization algorithm that finds adversarial examples with smaller perturbation size. Experiments on natural language and protein classification tasks demonstrate that our method consistently achieves higher attack success rate with significant reduction in query count and modification rate compared to the previous state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-lee22h,\n title = \t {Query-Efficient and Scalable Black-Box Adversarial Attacks on Discrete Sequential Data via {B}ayesian Optimization},\n author = {Lee, Deokjae and Moon, Seungyong and Lee, Junhyeok and Song, Hyun Oh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12478--12497},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22h/lee22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22h.html},\n abstract = \t {We focus on the problem of adversarial attacks against models on discrete sequential data in the black-box setting where the attacker aims to craft adversarial examples with limited query access to the victim model. 
Existing black-box attacks, mostly based on greedy algorithms, find adversarial examples using pre-computed key positions to perturb, which severely limits the search space and might result in suboptimal solutions. To this end, we propose a query-efficient black-box attack using Bayesian optimization, which dynamically computes important positions using an automatic relevance determination (ARD) categorical kernel. We introduce block decomposition and history subsampling techniques to improve the scalability of Bayesian optimization when an input sequence becomes long. Moreover, we develop a post-optimization algorithm that finds adversarial examples with smaller perturbation size. Experiments on natural language and protein classification tasks demonstrate that our method consistently achieves higher attack success rate with significant reduction in query count and modification rate compared to the previous state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22h/lee22h.pdf", "supp": "", "pdf_size": 506636, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10043868339521505770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, Seoul National University, Seoul, Korea; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/lee22h.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "RECAPP: Crafting a More Efficient Catalyst for Convex Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18351", "id": "18351", "proceeding": "https://proceedings.mlr.press/v162/carmon22a.html", "poster": "/media/PosterPDFs/ICML%202022/dead35fa1512ad67301d09326177c42f.png?t=1658077387.4798229", "slides": "", "author_site": "Yair Carmon, Arun Jambulapati, Yujia Jin, Aaron Sidford", "author": "Yair Carmon; Arun Jambulapati; Yujia Jin; Aaron Sidford", "abstract": "The accelerated proximal point method (APPA), also known as \"Catalyst\", is a well-established reduction from convex optimization to approximate proximal point computation (i.e., regularized minimization). This reduction is conceptually elegant and yields strong convergence rate guarantees. However, these rates feature an extraneous logarithmic term arising from the need to compute each proximal point to high accuracy. In this work, we propose a novel Relaxed Error Criterion for Accelerated Proximal Point (RECAPP) that eliminates the need for high accuracy subproblem solutions. We apply RECAPP to two canonical problems: finite-sum and max-structured minimization. For finite-sum problems, we match the best known complexity, previously obtained by carefully-designed problem-specific algorithms. 
For minimizing max_y f(x,y) where f is convex in x and strongly-concave in y, we improve on the best known (Catalyst-based) bound by a logarithmic factor.", "bibtex": "@InProceedings{pmlr-v162-carmon22a,\n title = \t {{RECAPP}: Crafting a More Efficient Catalyst for Convex Optimization},\n author = {Carmon, Yair and Jambulapati, Arun and Jin, Yujia and Sidford, Aaron},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2658--2685},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/carmon22a/carmon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/carmon22a.html},\n abstract = \t {The accelerated proximal point method (APPA), also known as \"Catalyst\", is a well-established reduction from convex optimization to approximate proximal point computation (i.e., regularized minimization). This reduction is conceptually elegant and yields strong convergence rate guarantees. However, these rates feature an extraneous logarithmic term arising from the need to compute each proximal point to high accuracy. In this work, we propose a novel Relaxed Error Criterion for Accelerated Proximal Point (RECAPP) that eliminates the need for high accuracy subproblem solutions. We apply RECAPP to two canonical problems: finite-sum and max-structured minimization. For finite-sum problems, we match the best known complexity, previously obtained by carefully-designed problem-specific algorithms. For minimizing max_y f(x,y) where f is convex in x and strongly-concave in y, we improve on the best known (Catalyst-based) bound by a logarithmic factor.}\n}", "pdf": "https://proceedings.mlr.press/v162/carmon22a/carmon22a.pdf", "supp": "", "pdf_size": 575760, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7906072571653012949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Tel Aviv University; Stanford University; Stanford University; Stanford University", "aff_domain": "tau.ac.il;stanford.edu;stanford.edu;stanford.edu", "email": "tau.ac.il;stanford.edu;stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/carmon22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Tel Aviv University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tau.ac.il;https://www.stanford.edu", "aff_unique_abbr": "TAU;Stanford", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Israel;United States" }, { "title": "REvolveR: Continuous Evolutionary Models for Robot-to-robot Policy Transfer", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18303", "id": "18303", "proceeding": "https://proceedings.mlr.press/v162/liu22p.html", "poster": "/media/PosterPDFs/ICML%202022/33e8075e9970de0cfea955afd4644bb2_CuGkecl.png?t=1657592073.3102412", "slides": "", "author_site": "Xingyu Liu, Deepak Pathak, Kris Kitani", "author": "Xingyu Liu; Deepak Pathak; Kris Kitani", "abstract": "A popular paradigm in robotic learning is to train a policy from scratch for every new robot. This is not only inefficient but also often impractical for complex robots. 
In this work, we consider the problem of transferring a policy across two different robots with significantly different parameters such as kinematics and morphology. Existing approaches that train a new policy by matching the action or state transition distribution, including imitation learning methods, fail due to optimal action and/or state distribution being mismatched in different robots. In this paper, we propose a novel method named REvolveR of using continuous evolutionary models for robotic policy transfer implemented in a physics simulator. We interpolate between the source robot and the target robot by finding a continuous evolutionary change of robot parameters. An expert policy on the source robot is transferred through training on a sequence of intermediate robots that gradually evolve into the target robot. Experiments on a physics simulator show that the proposed continuous evolutionary model can effectively transfer the policy across robots and achieve superior sample efficiency on new robots. The proposed method is especially advantageous in sparse reward settings where exploration can be significantly reduced.", "bibtex": "@InProceedings{pmlr-v162-liu22p,\n title = \t {{RE}volve{R}: Continuous Evolutionary Models for Robot-to-robot Policy Transfer},\n author = {Liu, Xingyu and Pathak, Deepak and Kitani, Kris},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13995--14007},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22p/liu22p.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22p.html},\n abstract = \t {A popular paradigm in robotic learning is to train a policy from scratch for every new robot. This is not only inefficient but also often impractical for complex robots. In this work, we consider the problem of transferring a policy across two different robots with significantly different parameters such as kinematics and morphology. Existing approaches that train a new policy by matching the action or state transition distribution, including imitation learning methods, fail due to optimal action and/or state distribution being mismatched in different robots. In this paper, we propose a novel method named REvolveR of using continuous evolutionary models for robotic policy transfer implemented in a physics simulator. We interpolate between the source robot and the target robot by finding a continuous evolutionary change of robot parameters. An expert policy on the source robot is transferred through training on a sequence of intermediate robots that gradually evolve into the target robot. Experiments on a physics simulator show that the proposed continuous evolutionary model can effectively transfer the policy across robots and achieve superior sample efficiency on new robots. 
The proposed method is especially advantageous in sparse reward settings where exploration can be significantly reduced.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22p/liu22p.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/liu22p-supp.zip", "pdf_size": 25831911, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4925772100401553485&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "The Robotics Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA; The Robotics Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA; The Robotics Institute, Carnegie Mellon University, Pittsburgh, PA 15213, USA", "aff_domain": "cs.cmu.edu; ; ", "email": "cs.cmu.edu; ; ", "github": "https://github.com/xingyul/revolver", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/liu22p.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "The Robotics Institute", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ROCK: Causal Inference Principles for Reasoning about Commonsense Causality", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16471", "id": "16471", "proceeding": "https://proceedings.mlr.press/v162/zhang22am.html", "poster": "/media/PosterPDFs/ICML%202022/28b60a16b55fd531047c0c958ce14b95_MYqLnYf.png?t=1657910824.4132757", "slides": "/media/icml-2022/Slides/16471_R9X6Y4u.pdf", "author_site": "Jiayao Zhang, Hongming ZHANG, Weijie Su, Dan Roth", "author": "Jiayao Zhang; Hongming Zhang; Weijie Su; Dan Roth", "abstract": "Commonsense causality reasoning (CCR) aims at identifying plausible causes and effects in natural language descriptions that are deemed reasonable by an average person. Although being of great academic and practical interest, this problem is still shadowed by the lack of a well-posed theoretical framework; existing work usually relies on deep language models wholeheartedly, and is potentially susceptible to confounding co-occurrences. Motivated by classical causal principles, we articulate the central question of CCR and draw parallels between human subjects in observational studies and natural languages to adopt CCR to the potential-outcomes framework, which is the first such attempt for commonsense tasks. We propose a novel framework, ROCK, to Reason O(A)bout Commonsense K(C)ausality, which utilizes temporal signals as incidental supervision, and balances confounding effects using temporal propensities that are analogous to propensity scores. 
The ROCK implementation is modular and zero-shot, and demonstrates good CCR capabilities.", "bibtex": "@InProceedings{pmlr-v162-zhang22am,\n title = \t {{ROCK}: Causal Inference Principles for Reasoning about Commonsense Causality},\n author = {Zhang, Jiayao and Zhang, Hongming and Su, Weijie and Roth, Dan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26750--26771},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22am/zhang22am.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22am.html},\n abstract = \t {Commonsense causality reasoning (CCR) aims at identifying plausible causes and effects in natural language descriptions that are deemed reasonable by an average person. Although being of great academic and practical interest, this problem is still shadowed by the lack of a well-posed theoretical framework; existing work usually relies on deep language models wholeheartedly, and is potentially susceptible to confounding co-occurrences. Motivated by classical causal principles, we articulate the central question of CCR and draw parallels between human subjects in observational studies and natural languages to adopt CCR to the potential-outcomes framework, which is the first such attempt for commonsense tasks. We propose a novel framework, ROCK, to Reason O(A)bout Commonsense K(C)ausality, which utilizes temporal signals as incidental supervision, and balances confounding effects using temporal propensities that are analogous to propensity scores. 
The ROCK implementation is modular and zero-shot, and demonstrates good CCR capabilities.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22am/zhang22am.pdf", "supp": "", "pdf_size": 791261, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4757630172142505662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Cognitive Computation Group, University of Pennsylvania, USA + Department of Statistics and Data Science, University of Pennsylvania, USA; Cognitive Computation Group, University of Pennsylvania, USA + Tencent AI Lab Seattle, USA; Department of Statistics and Data Science, University of Pennsylvania, USA; Cognitive Computation Group, University of Pennsylvania, USA + Amazon AWS AI Labs, USA", "aff_domain": "upenn.edu;upenn.edu;upenn.edu;upenn.edu", "email": "upenn.edu;upenn.edu;upenn.edu;upenn.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22am.html", "aff_unique_index": "0+0;0+1;0;0+2", "aff_unique_norm": "University of Pennsylvania;Tencent;Amazon", "aff_unique_dep": "Cognitive Computation Group;Tencent AI Lab;AI Labs", "aff_unique_url": "https://www.upenn.edu;https://ai.tencent.com;https://aws.amazon.com", "aff_unique_abbr": "UPenn;Tencent AI Lab;Amazon AWS AI Labs", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0+0;0+0;0;0+0", "aff_country_unique": "United States" }, { "title": "RUMs from Head-to-Head Contests", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16895", "id": "16895", "proceeding": "https://proceedings.mlr.press/v162/almanza22a.html", "poster": "/media/PosterPDFs/ICML%202022/90cc440b1b8caa520c562ac4e4bbcb51.png?t=1657974411.006212", "slides": "", "author_site": "Matteo Almanza, Flavio Chierichetti, Ravi Kumar, Alessandro Panconesi, Andrew Tomkins", "author": "Matteo Almanza; Flavio Chierichetti; Ravi Kumar; Alessandro Panconesi; Andrew Tomkins", "abstract": "Random utility models (RUMs) encode the likelihood that a particular item will be selected from a slate of competing items. RUMs are well-studied objects in both discrete choice theory and, more recently, in the machine learning community, as they encode a fairly broad notion of rational user behavior. In this paper, we focus on slates of size two representing head-to-head contests. Given a tournament matrix $M$ such that $M_{i,j}$ is the probability that item $j$ will be selected from $\\{i, j\\}$, we consider the problem of finding the RUM that most closely reproduces $M$. For this problem we obtain a polynomial-time algorithm returning a RUM that approximately minimizes the average error over the pairs. 
Our experiments show that RUMs can", "bibtex": "@InProceedings{pmlr-v162-almanza22a,\n title = \t {{RUM}s from Head-to-Head Contests},\n author = {Almanza, Matteo and Chierichetti, Flavio and Kumar, Ravi and Panconesi, Alessandro and Tomkins, Andrew},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {452--467},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/almanza22a/almanza22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/almanza22a.html},\n abstract = \t {Random utility models (RUMs) encode the likelihood that a particular item will be selected from a slate of competing items. RUMs are well-studied objects in both discrete choice theory and, more recently, in the machine learning community, as they encode a fairly broad notion of rational user behavior. In this paper, we focus on slates of size two representing head-to-head contests. Given a tournament matrix $M$ such that $M_{i,j}$ is the probability that item $j$ will be selected from $\\{i, j\\}$, we consider the problem of finding the RUM that most closely reproduces $M$. For this problem we obtain a polynomial-time algorithm returning a RUM that approximately minimizes the average error over the pairs. Our experiments show that RUMs can", "pdf": "https://proceedings.mlr.press/v162/almanza22a/almanza22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/almanza22a-supp.zip", "pdf_size": 443099, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1612272076085402541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Algorand Labs + Sapienza University of Rome; Sapienza University of Rome; Google Mountain View; Sapienza University of Rome; Google Mountain View", "aff_domain": "di.uniroma1.it; ; ; ; ", "email": "di.uniroma1.it; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/almanza22a.html", "aff_unique_index": "0+1;1;2;1;2", "aff_unique_norm": "Algorand Labs;Sapienza University of Rome;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.algorand.com;https://www.uniroma1.it;https://www.google.com", "aff_unique_abbr": "Algorand;Sapienza;Google", "aff_campus_unique_index": "1;1;2;1;2", "aff_campus_unique": ";Rome;Mountain View", "aff_country_unique_index": "0+1;1;0;1;0", "aff_country_unique": "United States;Italy" }, { "title": "Random Forest Density Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17319", "id": "17319", "proceeding": "https://proceedings.mlr.press/v162/wen22c.html", "poster": "/media/PosterPDFs/ICML%202022/b8af7d0fbf094517781e0382102d7b27_tUR34bn.png?t=1657200086.6679888", "slides": "", "author_site": "Hongwei Wen, Hanyuan Hang", "author": "Hongwei Wen; Hanyuan Hang", "abstract": "We propose a density estimation algorithm called", "bibtex": "@InProceedings{pmlr-v162-wen22c,\n title = \t {Random Forest Density Estimation},\n author = {Wen, Hongwei and Hang, Hanyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23701--23722},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, 
Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wen22c/wen22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/wen22c.html},\n abstract = \t {We propose a density estimation algorithm called", "pdf": "https://proceedings.mlr.press/v162/wen22c/wen22c.pdf", "supp": "", "pdf_size": 1268563, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7077708015102159925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Applied Mathematics, University of Twente, Enschede, The Netherlands; Department of Applied Mathematics, University of Twente, Enschede, The Netherlands", "aff_domain": "utwente.nl; ", "email": "utwente.nl; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wen22c.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Twente", "aff_unique_dep": "Department of Applied Mathematics", "aff_unique_url": "https://www.utwente.nl", "aff_unique_abbr": "UT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Enschede", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Random Gegenbauer Features for Scalable Kernel Methods", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16735", "id": "16735", "proceeding": "https://proceedings.mlr.press/v162/han22g.html", "poster": "/media/PosterPDFs/ICML%202022/2ba2520186ee376e835ce7bf1554ef7b.png?t=1657807205.106697", "slides": "", "author_site": "Insu Han, Amir Zandieh, Haim Avron", "author": "Insu Han; Amir Zandieh; Haim Avron", "abstract": "We propose efficient random features for approximating a new and rich class of kernel functions that we refer to as Generalized Zonal Kernels (GZK). Our proposed GZK family, generalizes the zonal kernels (i.e., dot-product kernels on the unit sphere) by introducing radial factors in the Gegenbauer series expansion of these kernel functions. The GZK class of kernels includes a wide range of ubiquitous kernel functions such as the entirety of dot-product kernels as well as the Gaussian and the recently introduced Neural Tangent kernels. Interestingly, by exploiting the reproducing property of the Gegenbauer (Zonal) Harmonics, we can construct efficient random features for the GZK family based on randomly oriented Gegenbauer harmonics. We prove subspace embedding guarantees for our Gegenbauer features which ensures that our features can be used for approximately solving learning problems such as kernel k-means clustering, kernel ridge regression, etc. 
Empirical results show that our proposed features outperform recent kernel approximation methods.", "bibtex": "@InProceedings{pmlr-v162-han22g,\n title = \t {Random Gegenbauer Features for Scalable Kernel Methods},\n author = {Han, Insu and Zandieh, Amir and Avron, Haim},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8330--8358},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22g/han22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22g.html},\n abstract = \t {We propose efficient random features for approximating a new and rich class of kernel functions that we refer to as Generalized Zonal Kernels (GZK). Our proposed GZK family, generalizes the zonal kernels (i.e., dot-product kernels on the unit sphere) by introducing radial factors in the Gegenbauer series expansion of these kernel functions. The GZK class of kernels includes a wide range of ubiquitous kernel functions such as the entirety of dot-product kernels as well as the Gaussian and the recently introduced Neural Tangent kernels. Interestingly, by exploiting the reproducing property of the Gegenbauer (Zonal) Harmonics, we can construct efficient random features for the GZK family based on randomly oriented Gegenbauer harmonics. We prove subspace embedding guarantees for our Gegenbauer features which ensures that our features can be used for approximately solving learning problems such as kernel k-means clustering, kernel ridge regression, etc. Empirical results show that our proposed features outperform recent kernel approximation methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22g/han22g.pdf", "supp": "", "pdf_size": 683249, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8990769293238558900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Yale University; Max-Planck-Institut f\u00fcr Informatik; Tel Aviv University", "aff_domain": "yale.edu;mpi-inf.mpg.de;tauex.tau.ac.il", "email": "yale.edu;mpi-inf.mpg.de;tauex.tau.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/han22g.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Yale University;Max-Planck-Institut f\u00fcr Informatik;Tel Aviv University", "aff_unique_dep": ";Informatik;", "aff_unique_url": "https://www.yale.edu;https://mpi-inf.mpg.de;https://www.tau.ac.il", "aff_unique_abbr": "Yale;MPII;TAU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2", "aff_country_unique": "United States;Germany;Israel" }, { "title": "RankSim: Ranking Similarity Regularization for Deep Imbalanced Regression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16495", "id": "16495", "proceeding": "https://proceedings.mlr.press/v162/gong22a.html", "poster": "/media/PosterPDFs/ICML%202022/2321994d85d661d792223f647000c65f_Sef34lH.png?t=1657681126.5271764", "slides": "", "author_site": "Yu Gong, Greg Mori, Frederick Tung", "author": "Yu Gong; Greg Mori; Fred Tung", "abstract": "Data imbalance, in which a plurality of the data samples come from a small proportion of labels, poses a challenge in training deep neural networks. 
Unlike classification, in regression the labels are continuous, potentially boundless, and form a natural ordering. These distinct features of regression call for new techniques that leverage the additional information encoded in label-space relationships. This paper presents the RankSim (ranking similarity) regularizer for deep imbalanced regression, which encodes an inductive bias that samples that are closer in label space should also be closer in feature space. In contrast to recent distribution smoothing based approaches, RankSim captures both nearby and distant relationships: for a given data sample, RankSim encourages the sorted list of its neighbors in label space to match the sorted list of its neighbors in feature space. RankSim is complementary to conventional imbalanced learning techniques, including re-weighting, two-stage training, and distribution smoothing, and lifts the state-of-the-art performance on three imbalanced regression benchmarks: IMDB-WIKI-DIR, AgeDB-DIR, and STS-B-DIR.", "bibtex": "@InProceedings{pmlr-v162-gong22a,\n title = \t {{R}ank{S}im: Ranking Similarity Regularization for Deep Imbalanced Regression},\n author = {Gong, Yu and Mori, Greg and Tung, Fred},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7634--7649},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gong22a/gong22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gong22a.html},\n abstract = \t {Data imbalance, in which a plurality of the data samples come from a small proportion of labels, poses a challenge in training deep neural networks. Unlike classification, in regression the labels are continuous, potentially boundless, and form a natural ordering. These distinct features of regression call for new techniques that leverage the additional information encoded in label-space relationships. This paper presents the RankSim (ranking similarity) regularizer for deep imbalanced regression, which encodes an inductive bias that samples that are closer in label space should also be closer in feature space. In contrast to recent distribution smoothing based approaches, RankSim captures both nearby and distant relationships: for a given data sample, RankSim encourages the sorted list of its neighbors in label space to match the sorted list of its neighbors in feature space. 
RankSim is complementary to conventional imbalanced learning techniques, including re-weighting, two-stage training, and distribution smoothing, and lifts the state-of-the-art performance on three imbalanced regression benchmarks: IMDB-WIKI-DIR, AgeDB-DIR, and STS-B-DIR.}\n}", "pdf": "https://proceedings.mlr.press/v162/gong22a/gong22a.pdf", "supp": "", "pdf_size": 16206738, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2649008384099907500&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Simon Fraser University+Borealis AI; Simon Fraser University+Borealis AI; Borealis AI", "aff_domain": "sfu.ca; ; ", "email": "sfu.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gong22a.html", "aff_unique_index": "0+1;0+1;1", "aff_unique_norm": "Simon Fraser University;Borealis AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.sfu.ca;https://www.borealisai.com", "aff_unique_abbr": "SFU;Borealis AI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "Canada" }, { "title": "Re-evaluating Word Mover\u2019s Distance", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16145", "id": "16145", "proceeding": "https://proceedings.mlr.press/v162/sato22b.html", "poster": "/media/PosterPDFs/ICML%202022/ce758408f6ef98d7c7a7b786eca7b3a8.png?t=1657847470.7335584", "slides": "", "author_site": "Ryoma Sato, Makoto Yamada, Hisashi Kashima", "author": "Ryoma Sato; Makoto Yamada; Hisashi Kashima", "abstract": "The word mover\u2019s distance (WMD) is a fundamental technique for measuring the similarity of two documents. As the crux of WMD, it can take advantage of the underlying geometry of the word space by employing an optimal transport formulation. The original study on WMD reported that WMD outperforms classical baselines such as bag-of-words (BOW) and TF-IDF by significant margins in various datasets. In this paper, we point out that the evaluation in the original study could be misleading. We re-evaluate the performances of WMD and the classical baselines and find that the classical baselines are competitive with WMD if we employ an appropriate preprocessing, i.e., L1 normalization. In addition, we introduce an analogy between WMD and L1-normalized BOW and find that not only the performance of WMD but also the distance values resemble those of BOW in high dimensional spaces.", "bibtex": "@InProceedings{pmlr-v162-sato22b,\n title = \t {Re-evaluating Word Mover\u2019s Distance},\n author = {Sato, Ryoma and Yamada, Makoto and Kashima, Hisashi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19231--19249},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sato22b/sato22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/sato22b.html},\n abstract = \t {The word mover\u2019s distance (WMD) is a fundamental technique for measuring the similarity of two documents. As the crux of WMD, it can take advantage of the underlying geometry of the word space by employing an optimal transport formulation. 
The original study on WMD reported that WMD outperforms classical baselines such as bag-of-words (BOW) and TF-IDF by significant margins in various datasets. In this paper, we point out that the evaluation in the original study could be misleading. We re-evaluate the performances of WMD and the classical baselines and find that the classical baselines are competitive with WMD if we employ an appropriate preprocessing, i.e., L1 normalization. In addition, we introduce an analogy between WMD and L1-normalized BOW and find that not only the performance of WMD but also the distance values resemble those of BOW in high dimensional spaces.}\n}", "pdf": "https://proceedings.mlr.press/v162/sato22b/sato22b.pdf", "supp": "", "pdf_size": 581877, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3123355047455627011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Kyoto University+RIKEN AIP; Kyoto University+RIKEN AIP; Kyoto University+RIKEN AIP", "aff_domain": "ml.ist.i.kyoto-u.ac.jp; ; ", "email": "ml.ist.i.kyoto-u.ac.jp; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sato22b.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Kyoto University;RIKEN", "aff_unique_dep": ";Advanced Institute for Computational Science", "aff_unique_url": "https://www.kyoto-u.ac.jp;https://www.aip.riken.jp", "aff_unique_abbr": "Kyoto U;RIKEN AIP", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Japan" }, { "title": "Reachability Constrained Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18133", "id": "18133", "proceeding": "https://proceedings.mlr.press/v162/yu22d.html", "poster": "", "slides": "/media/icml-2022/Slides/18133.pdf", "author_site": "Dongjie Yu, Haitong Ma, Shengbo Li, Jianyu Chen", "author": "Dongjie Yu; Haitong Ma; Shengbo Li; Jianyu Chen", "abstract": "Constrained reinforcement learning (CRL) has gained significant interest recently, since safety constraints satisfaction is critical for real-world problems. However, existing CRL methods constraining discounted cumulative costs generally lack rigorous definition and guarantee of safety. In contrast, in the safe control research, safety is defined as persistently satisfying certain state constraints. Such persistent safety is possible only on a subset of the state space, called feasible set, where an optimal largest feasible set exists for a given environment. Recent studies incorporate feasible sets into CRL with energy-based methods such as control barrier function (CBF), safety index (SI), and leverage prior conservative estimations of feasible sets, which harms the performance of the learned policy. To deal with this problem, this paper proposes the reachability CRL (RCRL) method by using reachability analysis to establish the novel self-consistency condition and characterize the feasible sets. The feasible sets are represented by the safety value function, which is used as the constraint in CRL. We use the multi-time scale stochastic approximation theory to prove that the proposed algorithm converges to a local optimum, where the largest feasible set can be guaranteed. 
Empirical results on different benchmarks validate the learned feasible set, the policy performance, and constraint satisfaction of RCRL, compared to CRL and safe control baselines.", "bibtex": "@InProceedings{pmlr-v162-yu22d,\n title = \t {Reachability Constrained Reinforcement Learning},\n author = {Yu, Dongjie and Ma, Haitong and Li, Shengbo and Chen, Jianyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25636--25655},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22d/yu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22d.html},\n abstract = \t {Constrained reinforcement learning (CRL) has gained significant interest recently, since safety constraints satisfaction is critical for real-world problems. However, existing CRL methods constraining discounted cumulative costs generally lack rigorous definition and guarantee of safety. In contrast, in the safe control research, safety is defined as persistently satisfying certain state constraints. Such persistent safety is possible only on a subset of the state space, called feasible set, where an optimal largest feasible set exists for a given environment. Recent studies incorporate feasible sets into CRL with energy-based methods such as control barrier function (CBF), safety index (SI), and leverage prior conservative estimations of feasible sets, which harms the performance of the learned policy. To deal with this problem, this paper proposes the reachability CRL (RCRL) method by using reachability analysis to establish the novel self-consistency condition and characterize the feasible sets. The feasible sets are represented by the safety value function, which is used as the constraint in CRL. We use the multi-time scale stochastic approximation theory to prove that the proposed algorithm converges to a local optimum, where the largest feasible set can be guaranteed. Empirical results on different benchmarks validate the learned feasible set, the policy performance, and constraint satisfaction of RCRL, compared to CRL and safe control baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22d/yu22d.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yu22d-supp.zip", "pdf_size": 2686690, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2404570936990332675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Vehicle and Mobility, Tsinghua University, Beijing, China; School of Vehicle and Mobility, Tsinghua University, Beijing, China + John A. 
Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, Massachusetts, USA; School of Vehicle and Mobility, Tsinghua University, Beijing, China; Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, China + Shanghai Qizhi Institute, Shanghai, China", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yu22d.html", "aff_unique_index": "0;0+1;0;0+2", "aff_unique_norm": "Tsinghua University;Harvard University;Shanghai Qizhi Institute", "aff_unique_dep": "School of Vehicle and Mobility;John A. Paulson School of Engineering and Applied Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.harvard.edu;", "aff_unique_abbr": "THU;Harvard;", "aff_campus_unique_index": "0;0+1;0;0+2", "aff_campus_unique": "Beijing;Cambridge;Shanghai", "aff_country_unique_index": "0;0+1;0;0+0", "aff_country_unique": "China;United States" }, { "title": "Reconstructing Nonlinear Dynamical Systems from Multi-Modal Time Series", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17181", "id": "17181", "proceeding": "https://proceedings.mlr.press/v162/kramer22a.html", "poster": "/media/PosterPDFs/ICML%202022/860052df4915de4d6c3deac9f7ebf5cc_I9JU7vo.png?t=1657543570.5467875", "slides": "", "author_site": "Daniel Kramer, Philine Bommer, Carlo Tombolini, Georgia Koppe, Daniel Durstewitz", "author": "Daniel Kramer; Philine L Bommer; Carlo Tombolini; Georgia Koppe; Daniel Durstewitz", "abstract": "Empirically observed time series in physics, biology, or medicine, are commonly generated by some underlying dynamical system (DS) which is the target of scientific interest. There is an increasing interest to harvest machine learning methods to reconstruct this latent DS in a data-driven, unsupervised way. In many areas of science it is common to sample time series observations from many data modalities simultaneously, e.g. electrophysiological and behavioral time series in a typical neuroscience experiment. However, current machine learning tools for reconstructing DSs usually focus on just one data modality. Here we propose a general framework for multi-modal data integration for the purpose of nonlinear DS reconstruction and the analysis of cross-modal relations. This framework is based on dynamically interpretable recurrent neural networks as general approximators of nonlinear DSs, coupled to sets of modality-specific decoder models from the class of generalized linear models. Both an expectation-maximization and a variational inference algorithm for model training are advanced and compared. 
We show on nonlinear DS benchmarks that our algorithms can efficiently compensate for too noisy or missing information in one data channel by exploiting other channels, and demonstrate on experimental neuroscience data how the algorithm learns to link different data domains to the underlying dynamics.", "bibtex": "@InProceedings{pmlr-v162-kramer22a,\n title = \t {Reconstructing Nonlinear Dynamical Systems from Multi-Modal Time Series},\n author = {Kramer, Daniel and Bommer, Philine L and Tombolini, Carlo and Koppe, Georgia and Durstewitz, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11613--11633},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kramer22a/kramer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kramer22a.html},\n abstract = \t {Empirically observed time series in physics, biology, or medicine, are commonly generated by some underlying dynamical system (DS) which is the target of scientific interest. There is an increasing interest to harvest machine learning methods to reconstruct this latent DS in a data-driven, unsupervised way. In many areas of science it is common to sample time series observations from many data modalities simultaneously, e.g. electrophysiological and behavioral time series in a typical neuroscience experiment. However, current machine learning tools for reconstructing DSs usually focus on just one data modality. Here we propose a general framework for multi-modal data integration for the purpose of nonlinear DS reconstruction and the analysis of cross-modal relations. This framework is based on dynamically interpretable recurrent neural networks as general approximators of nonlinear DSs, coupled to sets of modality-specific decoder models from the class of generalized linear models. Both an expectation-maximization and a variational inference algorithm for model training are advanced and compared. We show on nonlinear DS benchmarks that our algorithms can efficiently compensate for too noisy or missing information in one data channel by exploiting other channels, and demonstrate on experimental neuroscience data how the algorithm learns to link different data domains to the underlying dynamics.}\n}", "pdf": "https://proceedings.mlr.press/v162/kramer22a/kramer22a.pdf", "supp": "", "pdf_size": 8604500, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17080536605245199937&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Dept. of Theoretical Neuroscience, Central Institute of Mental Health, Heidelberg University, Germany; Dept. of Machine Learning, Technical University Berlin, Berlin, Germany; Dept. of Theoretical Neuroscience, Central Institute of Mental Health, Heidelberg University, Germany + Clinic for Psychiatry and Psychotherapy, Central Institute of Mental Health, Mannheim, Germany; Clinic for Psychiatry and Psychotherapy, Central Institute of Mental Health, Mannheim, Germany; Dept. 
of Theoretical Neuroscience, Central Institute of Mental Health, Heidelberg University, Germany + Faculty of Physics and Astronomy, Heidelberg University, Germany", "aff_domain": "zi-mannheim.de;tu-berlin.de; ; ;zi-mannheim.de", "email": "zi-mannheim.de;tu-berlin.de; ; ;zi-mannheim.de", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/kramer22a.html", "aff_unique_index": "0;1;0+2;2;0+0", "aff_unique_norm": "Heidelberg University;Technical University Berlin;Central Institute of Mental Health", "aff_unique_dep": "Dept. of Theoretical Neuroscience;Dept. of Machine Learning;Clinic for Psychiatry and Psychotherapy", "aff_unique_url": "https://www.uni-heidelberg.de;https://www.tu-berlin.de;", "aff_unique_abbr": "Uni HD;TUB;", "aff_campus_unique_index": "0;1;0+2;2;0+0", "aff_campus_unique": "Heidelberg;Berlin;Mannheim", "aff_country_unique_index": "0;0;0+0;0;0+0", "aff_country_unique": "Germany" }, { "title": "Recurrent Model-Free RL Can Be a Strong Baseline for Many POMDPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16431", "id": "16431", "proceeding": "https://proceedings.mlr.press/v162/ni22a.html", "poster": "/media/PosterPDFs/ICML%202022/acc3e0404646c57502b480dc052c4fe1_Jsmj8QE.png?t=1657657911.0596356", "slides": "", "author_site": "Tianwei Ni, Benjamin Eysenbach, Ruslan Salakhutdinov", "author": "Tianwei Ni; Benjamin Eysenbach; Ruslan Salakhutdinov", "abstract": "Many problems in RL, such as meta-RL, robust RL, generalization in RL, and temporal credit assignment, can be cast as POMDPs. In theory, simply augmenting model-free RL with memory-based architectures, such as recurrent neural networks, provides a general approach to solving all types of POMDPs. However, prior work has found that such recurrent model-free RL methods tend to perform worse than more specialized algorithms that are designed for specific types of POMDPs. This paper revisits this claim. We find that careful architecture and hyperparameter decisions can often yield a recurrent model-free implementation that performs on par with (and occasionally substantially better than) more sophisticated recent techniques. We compare to 21 environments from 6 prior specialized methods and find that our implementation achieves greater sample efficiency and asymptotic performance than these methods on 18/21 environments. We also release a simple and efficient implementation of recurrent model-free RL for future work to use as a baseline for POMDPs.", "bibtex": "@InProceedings{pmlr-v162-ni22a,\n title = \t {Recurrent Model-Free {RL} Can Be a Strong Baseline for Many {POMDP}s},\n author = {Ni, Tianwei and Eysenbach, Benjamin and Salakhutdinov, Ruslan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16691--16723},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ni22a/ni22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ni22a.html},\n abstract = \t {Many problems in RL, such as meta-RL, robust RL, generalization in RL, and temporal credit assignment, can be cast as POMDPs. 
In theory, simply augmenting model-free RL with memory-based architectures, such as recurrent neural networks, provides a general approach to solving all types of POMDPs. However, prior work has found that such recurrent model-free RL methods tend to perform worse than more specialized algorithms that are designed for specific types of POMDPs. This paper revisits this claim. We find that careful architecture and hyperparameter decisions can often yield a recurrent model-free implementation that performs on par with (and occasionally substantially better than) more sophisticated recent techniques. We compare to 21 environments from 6 prior specialized methods and find that our implementation achieves greater sample efficiency and asymptotic performance than these methods on 18/21 environments. We also release a simple and efficient implementation of recurrent model-free RL for future work to use as a baseline for POMDPs.}\n}", "pdf": "https://proceedings.mlr.press/v162/ni22a/ni22a.pdf", "supp": "", "pdf_size": 13117185, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16490517683493315047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Universit\u00e9 de Montr\u00e9al & Mila \u2013 Quebec AI Institute+ Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "mila.quebec;cs.cmu.edu; ", "email": "mila.quebec;cs.cmu.edu; ", "github": "https://github.com/twni2016/pomdp-baselines", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ni22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.umontreal.ca;https://www.cmu.edu", "aff_unique_abbr": "UdeM;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "Canada;United States" }, { "title": "Reducing Variance in Temporal-Difference Value Estimation via Ensemble of Deep Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17475", "id": "17475", "proceeding": "https://proceedings.mlr.press/v162/liang22c.html", "poster": "/media/PosterPDFs/ICML%202022/94739e5a5164b4d2396e253a11d57044_fxZ8cJM.png?t=1658318814.678837", "slides": "", "author_site": "Litian Liang, Yaosheng Xu, Stephen Mcaleer, Dailin Hu, Alexander Ihler, Pieter Abbeel, Roy Fox", "author": "Litian Liang; Yaosheng Xu; Stephen Mcaleer; Dailin Hu; Alexander Ihler; Pieter Abbeel; Roy Fox", "abstract": "In temporal-difference reinforcement learning algorithms, variance in value estimation can cause instability and overestimation of the maximal target value. Many algorithms have been proposed to reduce overestimation, including several recent ensemble methods, however none have shown success in sample-efficient learning through addressing estimation variance as the root cause of overestimation. In this paper, we propose MeanQ, a simple ensemble method that estimates target values as ensemble means. Despite its simplicity, MeanQ shows remarkable sample efficiency in experiments on the Atari Learning Environment benchmark. Importantly, we find that an ensemble of size 5 sufficiently reduces estimation variance to obviate the lagging target network, eliminating it as a source of bias and further gaining sample efficiency. We justify intuitively and empirically the design choices in MeanQ, including the necessity of independent experience sampling. 
On a set of 26 benchmark Atari environments, MeanQ outperforms all tested baselines, including the best available baseline, SUNRISE, at 100K interaction steps in 16/26 environments, and by 68% on average. MeanQ also outperforms Rainbow DQN at 500K steps in 21/26 environments, and by 49% on average, and achieves average human-level performance using 200K ($\\pm$100K) interaction steps. Our implementation is available at https://github.com/indylab/MeanQ.", "bibtex": "@InProceedings{pmlr-v162-liang22c,\n title = \t {Reducing Variance in Temporal-Difference Value Estimation via Ensemble of Deep Networks},\n author = {Liang, Litian and Xu, Yaosheng and Mcaleer, Stephen and Hu, Dailin and Ihler, Alexander and Abbeel, Pieter and Fox, Roy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13285--13301},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liang22c/liang22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/liang22c.html},\n abstract = \t {In temporal-difference reinforcement learning algorithms, variance in value estimation can cause instability and overestimation of the maximal target value. Many algorithms have been proposed to reduce overestimation, including several recent ensemble methods, however none have shown success in sample-efficient learning through addressing estimation variance as the root cause of overestimation. In this paper, we propose MeanQ, a simple ensemble method that estimates target values as ensemble means. Despite its simplicity, MeanQ shows remarkable sample efficiency in experiments on the Atari Learning Environment benchmark. Importantly, we find that an ensemble of size 5 sufficiently reduces estimation variance to obviate the lagging target network, eliminating it as a source of bias and further gaining sample efficiency. We justify intuitively and empirically the design choices in MeanQ, including the necessity of independent experience sampling. On a set of 26 benchmark Atari environments, MeanQ outperforms all tested baselines, including the best available baseline, SUNRISE, at 100K interaction steps in 16/26 environments, and by 68% on average. MeanQ also outperforms Rainbow DQN at 500K steps in 21/26 environments, and by 49% on average, and achieves average human-level performance using 200K ($\\pm$100K) interaction steps. 
Our implementation is available at https://github.com/indylab/MeanQ.}\n}", "pdf": "https://proceedings.mlr.press/v162/liang22c/liang22c.pdf", "supp": "", "pdf_size": 6184768, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5733035201533168571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of California, Irvine; University of California, Irvine; Carnegie Mellon University; University of California, Irvine; University of California, Irvine; University of California, Berkeley; University of California, Irvine", "aff_domain": "uci.edu; ; ; ; ; ;uci.edu", "email": "uci.edu; ; ; ; ; ;uci.edu", "github": "https://github.com/indylab/MeanQ", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/liang22c.html", "aff_unique_index": "0;0;1;0;0;2;0", "aff_unique_norm": "University of California, Irvine;Carnegie Mellon University;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uci.edu;https://www.cmu.edu;https://www.berkeley.edu", "aff_unique_abbr": "UCI;CMU;UC Berkeley", "aff_campus_unique_index": "0;0;0;0;2;0", "aff_campus_unique": "Irvine;;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Refined Convergence Rates for Maximum Likelihood Estimation under Finite Mixture Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16845", "id": "16845", "proceeding": "https://proceedings.mlr.press/v162/manole22a.html", "poster": "/media/PosterPDFs/ICML%202022/ac4d17530106c3e3c2fb5e2dad0e51b7.png?t=1658026856.70126", "slides": "/media/icml-2022/Slides/16845.pdf", "author_site": "Tudor Manole, Nhat Ho", "author": "Tudor Manole; Nhat Ho", "abstract": "We revisit the classical problem of deriving convergence rates for the maximum likelihood estimator (MLE) in finite mixture models. The Wasserstein distance has become a standard loss function for the analysis of parameter estimation in these models, due in part to its ability to circumvent label switching and to accurately characterize the behaviour of fitted mixture components with vanishing weights. However, the Wasserstein distance is only able to capture the worst-case convergence rate among the remaining fitted mixture components. We demonstrate that when the log-likelihood function is penalized to discourage vanishing mixing weights, stronger loss functions can be derived to resolve this shortcoming of the Wasserstein distance. These new loss functions accurately capture the heterogeneity in convergence rates of fitted mixture components, and we use them to sharpen existing pointwise and uniform convergence rates in various classes of mixture models. In particular, these results imply that a subset of the components of the penalized MLE typically converge significantly faster than could have been anticipated from past work. We further show that some of these conclusions extend to the traditional MLE. 
Our theoretical findings are supported by a simulation study to illustrate these improved convergence rates.", "bibtex": "@InProceedings{pmlr-v162-manole22a,\n title = \t {Refined Convergence Rates for Maximum Likelihood Estimation under Finite Mixture Models},\n author = {Manole, Tudor and Ho, Nhat},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14979--15006},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/manole22a/manole22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/manole22a.html},\n abstract = \t {We revisit the classical problem of deriving convergence rates for the maximum likelihood estimator (MLE) in finite mixture models. The Wasserstein distance has become a standard loss function for the analysis of parameter estimation in these models, due in part to its ability to circumvent label switching and to accurately characterize the behaviour of fitted mixture components with vanishing weights. However, the Wasserstein distance is only able to capture the worst-case convergence rate among the remaining fitted mixture components. We demonstrate that when the log-likelihood function is penalized to discourage vanishing mixing weights, stronger loss functions can be derived to resolve this shortcoming of the Wasserstein distance. These new loss functions accurately capture the heterogeneity in convergence rates of fitted mixture components, and we use them to sharpen existing pointwise and uniform convergence rates in various classes of mixture models. In particular, these results imply that a subset of the components of the penalized MLE typically converge significantly faster than could have been anticipated from past work. We further show that some of these conclusions extend to the traditional MLE. 
Our theoretical findings are supported by a simulation study to illustrate these improved convergence rates.}\n}", "pdf": "https://proceedings.mlr.press/v162/manole22a/manole22a.pdf", "supp": "", "pdf_size": 666027, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15536015401615707970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Statistics and Data Science, Carnegie Mellon University; Department of Statistics and Data Sciences, University of Texas, Austin", "aff_domain": "stat.cmu.edu; ", "email": "stat.cmu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/manole22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;University of Texas at Austin", "aff_unique_dep": "Department of Statistics and Data Science;Department of Statistics and Data Sciences", "aff_unique_url": "https://www.cmu.edu;https://www.utexas.edu", "aff_unique_abbr": "CMU;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Region-Based Semantic Factorization in GANs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18033", "id": "18033", "proceeding": "https://proceedings.mlr.press/v162/zhu22j.html", "poster": "", "slides": "", "author_site": "Jiapeng Zhu, Yujun Shen, Yinghao Xu, Deli Zhao, Qifeng Chen", "author": "Jiapeng Zhu; Yujun Shen; Yinghao Xu; Deli Zhao; Qifeng Chen", "abstract": "Despite the rapid advancement of semantic discovery in the latent space of Generative Adversarial Networks (GANs), existing approaches either are limited to finding global attributes or rely on a number of segmentation masks to identify local attributes. In this work, we present a highly efficient algorithm to factorize the latent semantics learned by GANs concerning an arbitrary image region. Concretely, we revisit the task of local manipulation with pre-trained GANs and formulate region-based semantic discovery as a dual optimization problem. Through an appropriately defined generalized Rayleigh quotient, we manage to solve such a problem without any annotations or training. Experimental results on various state-of-the-art GAN models demonstrate the effectiveness of our approach, as well as its superiority over prior arts regarding precise control, region robustness, speed of implementation, and simplicity of use.", "bibtex": "@InProceedings{pmlr-v162-zhu22j,\n title = \t {Region-Based Semantic Factorization in {GAN}s},\n author = {Zhu, Jiapeng and Shen, Yujun and Xu, Yinghao and Zhao, Deli and Chen, Qifeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27612--27632},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22j/zhu22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22j.html},\n abstract = \t {Despite the rapid advancement of semantic discovery in the latent space of Generative Adversarial Networks (GANs), existing approaches either are limited to finding global attributes or rely on a number of segmentation masks to identify local attributes. 
In this work, we present a highly efficient algorithm to factorize the latent semantics learned by GANs concerning an arbitrary image region. Concretely, we revisit the task of local manipulation with pre-trained GANs and formulate region-based semantic discovery as a dual optimization problem. Through an appropriately defined generalized Rayleigh quotient, we manage to solve such a problem without any annotations or training. Experimental results on various state-of-the-art GAN models demonstrate the effectiveness of our approach, as well as its superiority over prior arts regarding precise control, region robustness, speed of implementation, and simplicity of use.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22j/zhu22j.pdf", "supp": "", "pdf_size": 24983755, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15967827822215112166&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of CSE, The Hong Kong University of Science and Technology, Hong Kong, China; ByteDance, Beijing, China; Department of IE, The Chinese University of Hong Kong, Hong Kong, China; Ant Research, Hangzhou, China; Department of CSE, The Hong Kong University of Science and Technology, Hong Kong, China", "aff_domain": "ust.hk; ; ; ;ust.hk", "email": "ust.hk; ; ; ;ust.hk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhu22j.html", "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "Hong Kong University of Science and Technology;ByteDance;Chinese University of Hong Kong;Ant Research", "aff_unique_dep": "Department of CSE;;Department of IE;", "aff_unique_url": "https://www.ust.hk;https://www.bytedance.com;https://www.cuhk.edu.hk;https://www.antgroup.com", "aff_unique_abbr": "HKUST;ByteDance;CUHK;Ant Research", "aff_campus_unique_index": "0;1;0;2;0", "aff_campus_unique": "Hong Kong;Beijing;Hangzhou", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Regret Bounds for Stochastic Shortest Path Problems with Linear Function Approximation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16385", "id": "16385", "proceeding": "https://proceedings.mlr.press/v162/vial22a.html", "poster": "/media/PosterPDFs/ICML%202022/ba9fab001f67381e56e410575874d967.png?t=1657386208.1627958", "slides": "", "author_site": "Daniel Vial, Advait Parulekar, Sanjay Shakkottai, R Srikant", "author": "Daniel Vial; Advait Parulekar; Sanjay Shakkottai; R Srikant", "abstract": "We propose an algorithm that uses linear function approximation (LFA) for stochastic shortest path (SSP). Under minimal assumptions, it obtains sublinear regret, is computationally efficient, and uses stationary policies. To our knowledge, this is the first such algorithm in the LFA literature (for SSP or other formulations). 
Our algorithm is a special case of a more general one, which achieves regret square root in the number of episodes given access to a computation oracle.", "bibtex": "@InProceedings{pmlr-v162-vial22a,\n title = \t {Regret Bounds for Stochastic Shortest Path Problems with Linear Function Approximation},\n author = {Vial, Daniel and Parulekar, Advait and Shakkottai, Sanjay and Srikant, R},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22203--22233},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vial22a/vial22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vial22a.html},\n abstract = \t {We propose an algorithm that uses linear function approximation (LFA) for stochastic shortest path (SSP). Under minimal assumptions, it obtains sublinear regret, is computationally efficient, and uses stationary policies. To our knowledge, this is the first such algorithm in the LFA literature (for SSP or other formulations). Our algorithm is a special case of a more general one, which achieves regret square root in the number of episodes given access to a computation oracle.}\n}", "pdf": "https://proceedings.mlr.press/v162/vial22a/vial22a.pdf", "supp": "", "pdf_size": 501494, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11641513125491369722&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, University of Texas, Austin, TX, USA+Department of Electrical and Computer Engineering, University of Illinois, Urbana-Champaign, IL, USA; Department of Electrical and Computer Engineering, University of Texas, Austin, TX, USA; Department of Electrical and Computer Engineering, University of Texas, Austin, TX, USA; Department of Electrical and Computer Engineering, University of Illinois, Urbana-Champaign, IL, USA", "aff_domain": "utexas.edu; ; ; ", "email": "utexas.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/vial22a.html", "aff_unique_index": "0+1;0;0;1", "aff_unique_norm": "University of Texas at Austin;University of Illinois, Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.utexas.edu;https://illinois.edu", "aff_unique_abbr": "UT Austin;UIUC", "aff_campus_unique_index": "0+1;0;0;1", "aff_campus_unique": "Austin;Urbana-Champaign", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Regret Minimization with Performative Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16821", "id": "16821", "proceeding": "https://proceedings.mlr.press/v162/jagadeesan22a.html", "poster": "/media/PosterPDFs/ICML%202022/92cc227532d17e56e07902b254dfad10.png?t=1658123462.2044668", "slides": "", "author_site": "Meena Jagadeesan, Tijana Zrnic, Celestine Mendler-D\u00fcnner", "author": "Meena Jagadeesan; Tijana Zrnic; Celestine Mendler-D\u00fcnner", "abstract": "In performative prediction, the deployment of a predictive model triggers a shift in the data distribution. 
As these shifts are typically unknown ahead of time, the learner needs to deploy a model to get feedback about the distribution it induces. We study the problem of finding near-optimal models under performativity while maintaining low regret. On the surface, this problem might seem equivalent to a bandit problem. However, it exhibits a fundamentally richer feedback structure that we refer to as performative feedback: after every deployment, the learner receives samples from the shifted distribution rather than bandit feedback about the reward. Our main contribution is regret bounds that scale only with the complexity of the distribution shifts and not that of the reward function. The key algorithmic idea is careful exploration of the distribution shifts that informs a novel construction of confidence bounds on the risk of unexplored models. The construction only relies on smoothness of the shifts and does not assume convexity. More broadly, our work establishes a conceptual approach for leveraging tools from the bandits literature for the purpose of regret minimization with performative feedback.", "bibtex": "@InProceedings{pmlr-v162-jagadeesan22a,\n title = \t {Regret Minimization with Performative Feedback},\n author = {Jagadeesan, Meena and Zrnic, Tijana and Mendler-D{\\\"u}nner, Celestine},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9760--9785},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jagadeesan22a/jagadeesan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jagadeesan22a.html},\n abstract = \t {In performative prediction, the deployment of a predictive model triggers a shift in the data distribution. As these shifts are typically unknown ahead of time, the learner needs to deploy a model to get feedback about the distribution it induces. We study the problem of finding near-optimal models under performativity while maintaining low regret. On the surface, this problem might seem equivalent to a bandit problem. However, it exhibits a fundamentally richer feedback structure that we refer to as performative feedback: after every deployment, the learner receives samples from the shifted distribution rather than bandit feedback about the reward. Our main contribution is regret bounds that scale only with the complexity of the distribution shifts and not that of the reward function. The key algorithmic idea is careful exploration of the distribution shifts that informs a novel construction of confidence bounds on the risk of unexplored models. The construction only relies on smoothness of the shifts and does not assume convexity. 
More broadly, our work establishes a conceptual approach for leveraging tools from the bandits literature for the purpose of regret minimization with performative feedback.}\n}", "pdf": "https://proceedings.mlr.press/v162/jagadeesan22a/jagadeesan22a.pdf", "supp": "", "pdf_size": 347408, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3196789686124148466&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of California, Berkeley; University of California, Berkeley; Max Planck Institute for Intelligent Systems, T\u00fcbingen", "aff_domain": "berkeley.edu;berkeley.edu;tuebingen.mpg.de", "email": "berkeley.edu;berkeley.edu;tuebingen.mpg.de", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jagadeesan22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Berkeley;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.mpi-is.mpg.de", "aff_unique_abbr": "UC Berkeley;MPI-IS", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;T\u00fcbingen", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Germany" }, { "title": "Regularizing a Model-based Policy Stationary Distribution to Stabilize Offline Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18409", "id": "18409", "proceeding": "https://proceedings.mlr.press/v162/yang22b.html", "poster": "/media/PosterPDFs/ICML%202022/cd7c230fc5deb01ff5f7b1be1acef9cf.png?t=1660244542.1248937", "slides": "/media/icml-2022/Slides/18409.pdf", "author_site": "Shentao Yang, Yihao Feng, Shujian Zhang, Mingyuan Zhou", "author": "Shentao Yang; Yihao Feng; Shujian Zhang; Mingyuan Zhou", "abstract": "Offline reinforcement learning (RL) extends the paradigm of classical RL algorithms to purely learning from static datasets, without interacting with the underlying environment during the learning process. A key challenge of offline RL is the instability of policy training, caused by the mismatch between the distribution of the offline data and the undiscounted stationary state-action distribution of the learned policy. To avoid the detrimental impact of distribution mismatch, we regularize the undiscounted stationary distribution of the current policy towards the offline data during the policy optimization process. Further, we train a dynamics model to both implement this regularization and better estimate the stationary distribution of the current policy, reducing the error induced by distribution mismatch. On a wide range of continuous-control offline RL datasets, our method indicates competitive performance, which validates our algorithm. 
The code is publicly available.", "bibtex": "@InProceedings{pmlr-v162-yang22b,\n title = \t {Regularizing a Model-based Policy Stationary Distribution to Stabilize Offline Reinforcement Learning},\n author = {Yang, Shentao and Feng, Yihao and Zhang, Shujian and Zhou, Mingyuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24980--25006},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22b/yang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22b.html},\n abstract = \t {Offline reinforcement learning (RL) extends the paradigm of classical RL algorithms to purely learning from static datasets, without interacting with the underlying environment during the learning process. A key challenge of offline RL is the instability of policy training, caused by the mismatch between the distribution of the offline data and the undiscounted stationary state-action distribution of the learned policy. To avoid the detrimental impact of distribution mismatch, we regularize the undiscounted stationary distribution of the current policy towards the offline data during the policy optimization process. Further, we train a dynamics model to both implement this regularization and better estimate the stationary distribution of the current policy, reducing the error induced by distribution mismatch. On a wide range of continuous-control offline RL datasets, our method indicates competitive performance, which validates our algorithm. The code is publicly available.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22b/yang22b.pdf", "supp": "", "pdf_size": 933405, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1188226225988660555&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "McCombs School of Business; Department of Computer Science; Department of Statistics & Data Science; McCombs School of Business + Department of Statistics & Data Science", "aff_domain": "mccombs.utexas.edu; ; ;mccombs.utexas.edu", "email": "mccombs.utexas.edu; ; ;mccombs.utexas.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yang22b.html", "aff_unique_index": "0;1;2;0+2", "aff_unique_norm": "University of Texas at Austin;Unknown Institution;University Affiliation Not Specified", "aff_unique_dep": "McCombs School of Business;Department of Computer Science;Department of Statistics & Data Science", "aff_unique_url": "https://mccombs.utexas.edu;;", "aff_unique_abbr": "UT Austin;;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "title": "Reinforcement Learning from Partial Observation: Linear Function Approximation with Provable Sample Efficiency", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17599", "id": "17599", "proceeding": "https://proceedings.mlr.press/v162/cai22c.html", "poster": "/media/PosterPDFs/ICML%202022/6c349155b122aa8ad5c877007e05f24f_c349Dp6.png?t=1658036298.79161", "slides": "", "author_site": "Qi Cai, Zhuoran Yang, Zhaoran Wang", "author": "Qi Cai; Zhuoran Yang; Zhaoran Wang", "abstract": "We study reinforcement learning for partially observed Markov 
decision processes (POMDPs) with infinite observation and state spaces, which remains less investigated theoretically. To this end, we make the first attempt at bridging partial observability and function approximation for a class of POMDPs with a linear structure. In detail, we propose a reinforcement learning algorithm (Optimistic Exploration via Adversarial Integral Equation or OP-TENET) that attains an $\\epsilon$-optimal policy within $O(1/\\epsilon^2)$ episodes. In particular, the sample complexity scales polynomially in the intrinsic dimension of the linear structure and is independent of the size of the observation and state spaces. The sample efficiency of OP-TENET is enabled by a sequence of ingredients: (i) a Bellman operator with finite memory, which represents the value function in a recursive manner, (ii) the identification and estimation of such an operator via an adversarial integral equation, which features a smoothed discriminator tailored to the linear structure, and (iii) the exploration of the observation and state spaces via optimism, which is based on quantifying the uncertainty in the adversarial integral equation.", "bibtex": "@InProceedings{pmlr-v162-cai22c,\n title = \t {Reinforcement Learning from Partial Observation: Linear Function Approximation with Provable Sample Efficiency},\n author = {Cai, Qi and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2485--2522},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cai22c/cai22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/cai22c.html},\n abstract = \t {We study reinforcement learning for partially observed Markov decision processes (POMDPs) with infinite observation and state spaces, which remains less investigated theoretically. To this end, we make the first attempt at bridging partial observability and function approximation for a class of POMDPs with a linear structure. In detail, we propose a reinforcement learning algorithm (Optimistic Exploration via Adversarial Integral Equation or OP-TENET) that attains an $\\epsilon$-optimal policy within $O(1/\\epsilon^2)$ episodes. In particular, the sample complexity scales polynomially in the intrinsic dimension of the linear structure and is independent of the size of the observation and state spaces. 
The sample efficiency of OP-TENET is enabled by a sequence of ingredients: (i) a Bellman operator with finite memory, which represents the value function in a recursive manner, (ii) the identification and estimation of such an operator via an adversarial integral equation, which features a smoothed discriminator tailored to the linear structure, and (iii) the exploration of the observation and state spaces via optimism, which is based on quantifying the uncertainty in the adversarial integral equation.}\n}", "pdf": "https://proceedings.mlr.press/v162/cai22c/cai22c.pdf", "supp": "", "pdf_size": 452626, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9969865630197808459&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, USA; Department of Statistics and Data Science, Yale University, New Haven, USA; Department of Industrial Engineering and Management Sciences, Northwestern University, Evanston, USA", "aff_domain": "u.northwestern.edu;yale.edu;gmail.com", "email": "u.northwestern.edu;yale.edu;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cai22c.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Northwestern University;Yale University", "aff_unique_dep": "Department of Industrial Engineering and Management Sciences;Department of Statistics and Data Science", "aff_unique_url": "https://www.northwestern.edu;https://www.yale.edu", "aff_unique_abbr": "NU;Yale", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Evanston;New Haven", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Reinforcement Learning with Action-Free Pre-Training from Videos", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17409", "id": "17409", "proceeding": "https://proceedings.mlr.press/v162/seo22a.html", "poster": "/media/PosterPDFs/ICML%202022/3323fe11e9595c09af38fe67567a9394_aeG6B8y.png?t=1657908055.2593186", "slides": "/media/icml-2022/Slides/17409_zpElo2C.pdf", "author_site": "Younggyo Seo, Kimin Lee, Stephen James, Pieter Abbeel", "author": "Younggyo Seo; Kimin Lee; Stephen L James; Pieter Abbeel", "abstract": "Recent unsupervised pre-training methods have shown to be effective on language and vision domains by learning useful representations for multiple downstream tasks. In this paper, we investigate if such unsupervised pre-training methods can also be effective for vision-based reinforcement learning (RL). To this end, we introduce a framework that learns representations useful for understanding the dynamics via generative pre-training on videos. Our framework consists of two phases: we pre-train an action-free latent video prediction model, and then utilize the pre-trained representations for efficiently learning action-conditional world models on unseen environments. To incorporate additional action inputs during fine-tuning, we introduce a new architecture that stacks an action-conditional latent prediction model on top of the pre-trained action-free prediction model. Moreover, for better exploration, we propose a video-based intrinsic bonus that leverages pre-trained representations. We demonstrate that our framework significantly improves both final performances and sample-efficiency of vision-based RL in a variety of manipulation and locomotion tasks. 
Code is available at \\url{https://github.com/younggyoseo/apv}.", "bibtex": "@InProceedings{pmlr-v162-seo22a,\n title = \t {Reinforcement Learning with Action-Free Pre-Training from Videos},\n author = {Seo, Younggyo and Lee, Kimin and James, Stephen L and Abbeel, Pieter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19561--19579},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/seo22a/seo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/seo22a.html},\n abstract = \t {Recent unsupervised pre-training methods have shown to be effective on language and vision domains by learning useful representations for multiple downstream tasks. In this paper, we investigate if such unsupervised pre-training methods can also be effective for vision-based reinforcement learning (RL). To this end, we introduce a framework that learns representations useful for understanding the dynamics via generative pre-training on videos. Our framework consists of two phases: we pre-train an action-free latent video prediction model, and then utilize the pre-trained representations for efficiently learning action-conditional world models on unseen environments. To incorporate additional action inputs during fine-tuning, we introduce a new architecture that stacks an action-conditional latent prediction model on top of the pre-trained action-free prediction model. Moreover, for better exploration, we propose a video-based intrinsic bonus that leverages pre-trained representations. We demonstrate that our framework significantly improves both final performances and sample-efficiency of vision-based RL in a variety of manipulation and locomotion tasks. 
Code is available at \\url{https://github.com/younggyoseo/apv}.}\n}", "pdf": "https://proceedings.mlr.press/v162/seo22a/seo22a.pdf", "supp": "", "pdf_size": 1265902, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6676654951334590185&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "KAIST+UC Berkeley; UC Berkeley; UC Berkeley; UC Berkeley", "aff_domain": "kaist.ac.kr; ; ; ", "email": "kaist.ac.kr; ; ; ", "github": "https://github.com/younggyoseo/apv", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/seo22a.html", "aff_unique_index": "0+1;1;1;1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.berkeley.edu", "aff_unique_abbr": "KAIST;UC Berkeley", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+1;1;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Removing Batch Normalization Boosts Adversarial Training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16541", "id": "16541", "proceeding": "https://proceedings.mlr.press/v162/wang22ap.html", "poster": "", "slides": "", "author_site": "Haotao Wang, Aston Zhang, Shuai Zheng, Xingjian Shi, Mu Li, Zhangyang \u201cAtlas\u201d Wang", "author": "Haotao Wang; Aston Zhang; Shuai Zheng; Xingjian Shi; Mu Li; Zhangyang Wang", "abstract": "Adversarial training (AT) defends deep neural networks against adversarial attacks. One challenge that limits its practical application is the performance degradation on clean samples. A major bottleneck identified by previous works is the widely used batch normalization (BN), which struggles to model the different statistics of clean and adversarial training samples in AT. Although the dominant approach is to extend BN to capture this mixture of distribution, we propose to completely eliminate this bottleneck by removing all BN layers in AT. Our normalizer-free robust training (NoFrost) method extends recent advances in normalizer-free networks to AT for its unexplored advantage on handling the mixture distribution challenge. We show that NoFrost achieves adversarial robustness with only a minor sacrifice on clean sample accuracy. On ImageNet with ResNet50, NoFrost achieves $74.06%$ clean accuracy, which drops merely $2.00%$ from standard training. In contrast, BN-based AT obtains $59.28%$ clean accuracy, suffering a significant $16.78%$ drop from standard training. In addition, NoFrost achieves a $23.56%$ adversarial robustness against PGD attack, which improves the $13.57%$ robustness in BN-based AT. We observe better model smoothness and larger decision margins from NoFrost, which make the models less sensitive to input perturbations and thus more robust. Moreover, when incorporating more data augmentations into NoFrost, it achieves comprehensive robustness against multiple distribution shifts. 
Code and pre-trained models are public at https://github.com/amazon-research/normalizer-free-robust-training.", "bibtex": "@InProceedings{pmlr-v162-wang22ap,\n title = \t {Removing Batch Normalization Boosts Adversarial Training},\n author = {Wang, Haotao and Zhang, Aston and Zheng, Shuai and Shi, Xingjian and Li, Mu and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23433--23445},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ap/wang22ap.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ap.html},\n abstract = \t {Adversarial training (AT) defends deep neural networks against adversarial attacks. One challenge that limits its practical application is the performance degradation on clean samples. A major bottleneck identified by previous works is the widely used batch normalization (BN), which struggles to model the different statistics of clean and adversarial training samples in AT. Although the dominant approach is to extend BN to capture this mixture of distribution, we propose to completely eliminate this bottleneck by removing all BN layers in AT. Our normalizer-free robust training (NoFrost) method extends recent advances in normalizer-free networks to AT for its unexplored advantage on handling the mixture distribution challenge. We show that NoFrost achieves adversarial robustness with only a minor sacrifice on clean sample accuracy. On ImageNet with ResNet50, NoFrost achieves $74.06%$ clean accuracy, which drops merely $2.00%$ from standard training. In contrast, BN-based AT obtains $59.28%$ clean accuracy, suffering a significant $16.78%$ drop from standard training. In addition, NoFrost achieves a $23.56%$ adversarial robustness against PGD attack, which improves the $13.57%$ robustness in BN-based AT. We observe better model smoothness and larger decision margins from NoFrost, which make the models less sensitive to input perturbations and thus more robust. Moreover, when incorporating more data augmentations into NoFrost, it achieves comprehensive robustness against multiple distribution shifts. 
Code and pre-trained models are public at https://github.com/amazon-research/normalizer-free-robust-training.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ap/wang22ap.pdf", "supp": "", "pdf_size": 1335322, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4233277386290159249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "University of Texas at Austin + Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services; Amazon Web Services; University of Texas at Austin + Amazon Web Services", "aff_domain": "utexas.edu;amazon.com; ; ; ;utexas.edu", "email": "utexas.edu;amazon.com; ; ; ;utexas.edu", "github": "https://github.com/amazon-research/normalizer-free-robust-training", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22ap.html", "aff_unique_index": "0+1;1;1;1;1;0+1", "aff_unique_norm": "University of Texas at Austin;Amazon", "aff_unique_dep": ";Amazon Web Services", "aff_unique_url": "https://www.utexas.edu;https://aws.amazon.com", "aff_unique_abbr": "UT Austin;AWS", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0+0;0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Representation Topology Divergence: A Method for Comparing Neural Network Representations.", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18187", "id": "18187", "proceeding": "https://proceedings.mlr.press/v162/barannikov22a.html", "poster": "/media/PosterPDFs/ICML%202022/342c472b95d00421be10e9512b532866_kJODVkm.png?t=1658425255.528457", "slides": "", "author_site": "Serguei Barannikov, Ilya Trofimov, Nikita Balabin, Evgeny Burnaev", "author": "Serguei Barannikov; Ilya Trofimov; Nikita Balabin; Evgeny Burnaev", "abstract": "Comparison of data representations is a complex multi-aspect problem. We propose a method for comparing two data representations. We introduce the Representation Topology Divergence (RTD) score measuring the dissimilarity in multi-scale topology between two point clouds of equal size with a one-to-one correspondence between points. The two data point clouds can lie in different ambient spaces. The RTD score is one of the few topological data analysis based practical methods applicable to real machine learning datasets. Experiments show the agreement of RTD with the intuitive assessment of data representation similarity. The proposed RTD score is sensitive to the data representation\u2019s fine topological structure. 
We use the RTD score to gain insights on neural networks representations in computer vision and NLP domains for various problems: training dynamics analysis, data distribution shift, transfer learning, ensemble learning, disentanglement assessment.", "bibtex": "@InProceedings{pmlr-v162-barannikov22a,\n title = \t {Representation Topology Divergence: A Method for Comparing Neural Network Representations.},\n author = {Barannikov, Serguei and Trofimov, Ilya and Balabin, Nikita and Burnaev, Evgeny},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1607--1626},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/barannikov22a/barannikov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/barannikov22a.html},\n abstract = \t {Comparison of data representations is a complex multi-aspect problem. We propose a method for comparing two data representations. We introduce the Representation Topology Divergence (RTD) score measuring the dissimilarity in multi-scale topology between two point clouds of equal size with a one-to-one correspondence between points. The two data point clouds can lie in different ambient spaces. The RTD score is one of the few topological data analysis based practical methods applicable to real machine learning datasets. Experiments show the agreement of RTD with the intuitive assessment of data representation similarity. The proposed RTD score is sensitive to the data representation\u2019s fine topological structure. 
We use the RTD score to gain insights on neural networks representations in computer vision and NLP domains for various problems: training dynamics analysis, data distribution shift, transfer learning, ensemble learning, disentanglement assessment.}\n}", "pdf": "https://proceedings.mlr.press/v162/barannikov22a/barannikov22a.pdf", "supp": "", "pdf_size": 9009507, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11154211416501540994&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Skolkovo Institute of Science and Technology, Moscow, Russia+CNRS, Universit\u00e9 Paris Cit\u00e9, France; Skolkovo Institute of Science and Technology, Moscow, Russia; Skolkovo Institute of Science and Technology, Moscow, Russia; Skolkovo Institute of Science and Technology, Moscow, Russia+Artificial Intelligence Research Institute (AIRI), Moscow, Russia", "aff_domain": "skoltech.ru; ; ; ", "email": "skoltech.ru; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/barannikov22a.html", "aff_unique_index": "0+1;0;0;0+2", "aff_unique_norm": "Skolkovo Institute of Science and Technology;CNRS;Artificial Intelligence Research Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.skoltech.ru;https://www.cnrs.fr;", "aff_unique_abbr": "Skoltech;CNRS;AIRI", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "Moscow;", "aff_country_unique_index": "0+1;0;0;0+0", "aff_country_unique": "Russian Federation;France" }, { "title": "Residual-Based Sampling for Online Outlier-Robust PCA", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16101", "id": "16101", "proceeding": "https://proceedings.mlr.press/v162/zhu22i.html", "poster": "/media/PosterPDFs/ICML%202022/3cf2559725a9fdfa602ec8c887440f32.png?t=1658128795.1495857", "slides": "", "author_site": "Tianhao Zhu, Jie Shen", "author": "Tianhao Zhu; Jie Shen", "abstract": "Outlier-robust principal component analysis (ORPCA) has been broadly applied in scientific discovery in the last decades. In this paper, we study online ORPCA, an important variant that addresses the practical challenge that the data points arrive in a sequential manner and the goal is to recover the underlying subspace of the clean data with one pass of the data. Our main contribution is the first provable algorithm that enjoys comparable recovery guarantee to the best known batch algorithm, while significantly improving upon the state-of-the-art online ORPCA algorithms. The core technique is a robust version of the residual norm which, informally speaking, leverages not only the importance of a data point, but also how likely it behaves as an outlier.", "bibtex": "@InProceedings{pmlr-v162-zhu22i,\n title = \t {Residual-Based Sampling for Online Outlier-Robust {PCA}},\n author = {Zhu, Tianhao and Shen, Jie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27591--27611},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22i/zhu22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22i.html},\n abstract = \t {Outlier-robust principal component analysis (ORPCA) has been broadly applied in scientific discovery in the last decades. 
In this paper, we study online ORPCA, an important variant that addresses the practical challenge that the data points arrive in a sequential manner and the goal is to recover the underlying subspace of the clean data with one pass of the data. Our main contribution is the first provable algorithm that enjoys comparable recovery guarantee to the best known batch algorithm, while significantly improving upon the state-of-the-art online ORPCA algorithms. The core technique is a robust version of the residual norm which, informally speaking, leverages not only the importance of a data point, but also how likely it behaves as an outlier.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22i/zhu22i.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhu22i-supp.zip", "pdf_size": 463652, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:s8BVRZxCzk0J:scholar.google.com/&scioq=Residual-Based+Sampling+for+Online+Outlier-Robust+PCA&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff": "Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA; Department of Computer Science, Stevens Institute of Technology, Hoboken, New Jersey, USA", "aff_domain": "stevens.edu;stevens.edu", "email": "stevens.edu;stevens.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhu22i.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stevens Institute of Technology", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stevens.edu", "aff_unique_abbr": "SIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hoboken", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Resilient and Communication Efficient Learning for Heterogeneous Federated Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16699", "id": "16699", "proceeding": "https://proceedings.mlr.press/v162/zhu22e.html", "poster": "/media/PosterPDFs/ICML%202022/59139a91a16c9b09a388091bdfe639de_rkTf9Yj.png?t=1657741212.3302102", "slides": "", "author_site": "Zhuangdi Zhu, Junyuan Hong, Steve Drew, Jiayu Zhou", "author": "Zhuangdi Zhu; Junyuan Hong; Steve Drew; Jiayu Zhou", "abstract": "The rise of Federated Learning (FL) is bringing machine learning to edge computing by utilizing data scattered across edge devices. However, the heterogeneity of edge network topologies and the uncertainty of wireless transmission are two major obstructions of FL\u2019s wide application in edge computing, leading to prohibitive convergence time and high communication cost. In this work, we propose an FL scheme to address both challenges simultaneously. Specifically, we enable edge devices to learn self-distilled neural networks that are readily prunable to arbitrary sizes, which capture the knowledge of the learning domain in a nested and progressive manner. Not only does our approach tackle system heterogeneity by serving edge devices with varying model architectures, but it also alleviates the issue of connection uncertainty by allowing transmitting part of the model parameters under faulty network connections, without wasting the contributing knowledge of the transmitted parameters. 
Extensive empirical studies show that under system heterogeneity and network instability, our approach demonstrates significant resilience and higher communication efficiency compared to the state-of-the-art.", "bibtex": "@InProceedings{pmlr-v162-zhu22e,\n title = \t {Resilient and Communication Efficient Learning for Heterogeneous Federated Systems},\n author = {Zhu, Zhuangdi and Hong, Junyuan and Drew, Steve and Zhou, Jiayu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27504--27526},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22e/zhu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22e.html},\n abstract = \t {The rise of Federated Learning (FL) is bringing machine learning to edge computing by utilizing data scattered across edge devices. However, the heterogeneity of edge network topologies and the uncertainty of wireless transmission are two major obstructions of FL\u2019s wide application in edge computing, leading to prohibitive convergence time and high communication cost. In this work, we propose an FL scheme to address both challenges simultaneously. Specifically, we enable edge devices to learn self-distilled neural networks that are readily prunable to arbitrary sizes, which capture the knowledge of the learning domain in a nested and progressive manner. Not only does our approach tackle system heterogeneity by serving edge devices with varying model architectures, but it also alleviates the issue of connection uncertainty by allowing transmitting part of the model parameters under faulty network connections, without wasting the contributing knowledge of the transmitted parameters. 
Extensive empirical studies show that under system heterogeneity and network instability, our approach demonstrates significant resilience and higher communication efficiency compared to the state-of-the-art.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22e/zhu22e.pdf", "supp": "", "pdf_size": 24798048, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17302034368288201324&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science and Engineering, Michigan State University; Department of Computer Science and Engineering, Michigan State University; Department of Electrical and Software Engineering, University of Calgary; Department of Computer Science and Engineering, Michigan State University", "aff_domain": "msu.edu; ; ;msu.edu", "email": "msu.edu; ; ;msu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhu22e.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Michigan State University;University of Calgary", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Electrical and Software Engineering", "aff_unique_url": "https://www.msu.edu;https://www.ucalgary.ca", "aff_unique_abbr": "MSU;U of C", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Restarted Nonconvex Accelerated Gradient Descent: No More Polylogarithmic Factor in the $O(\u03b5^-7/4)$ Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16489", "id": "16489", "proceeding": "https://proceedings.mlr.press/v162/li22o.html", "poster": "/media/PosterPDFs/ICML%202022/a9365bd906e11324065c35be476beb0c.png?t=1657166380.315585", "slides": "", "author_site": "Huan Li, Zhouchen Lin", "author": "Huan Li; Zhouchen Lin", "abstract": "This paper studies the accelerated gradient descent for general nonconvex problems under the gradient Lipschitz and Hessian Lipschitz assumptions. We establish that a simple restarted accelerated gradient descent (AGD) finds an $\\epsilon$-approximate first-order stationary point in $O(\\epsilon^{-7/4})$ gradient computations with simple proofs. Our complexity does not hide any polylogarithmic factors, and thus it improves over the state-of-the-art one by the $O(\\log\\frac{1}{\\epsilon})$ factor. Our simple algorithm only consists of Nesterov\u2019s classical AGD and a restart mechanism, and it does not need the negative curvature exploitation or the optimization of regularized surrogate functions. 
Technically, our simple proof does not invoke the analysis for the strongly convex AGD, which is crucial to remove the $O(\\log\\frac{1}{\\epsilon})$ factor.", "bibtex": "@InProceedings{pmlr-v162-li22o,\n title = \t {Restarted Nonconvex Accelerated Gradient Descent: No More Polylogarithmic Factor in the $O(\u03b5^{-7/4})$ Complexity},\n author = {Li, Huan and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12901--12916},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22o/li22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22o.html},\n abstract = \t {This paper studies the accelerated gradient descent for general nonconvex problems under the gradient Lipschitz and Hessian Lipschitz assumptions. We establish that a simple restarted accelerated gradient descent (AGD) finds an $\\epsilon$-approximate first-order stationary point in $O(\\epsilon^{-7/4})$ gradient computations with simple proofs. Our complexity does not hide any polylogarithmic factors, and thus it improves over the state-of-the-art one by the $O(\\log\\frac{1}{\\epsilon})$ factor. Our simple algorithm only consists of Nesterov\u2019s classical AGD and a restart mechanism, and it does not need the negative curvature exploitation or the optimization of regularized surrogate functions. Technically, our simple proof does not invoke the analysis for the strongly convex AGD, which is crucial to remove the $O(\\log\\frac{1}{\\epsilon})$ factor.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22o/li22o.pdf", "supp": "", "pdf_size": 757897, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6214400720936415091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Institute of Robotics and Automatic Information Systems, College of Artificial Intelligence, Nankai University, Tianjin, China; Key Laboratory of Machine Perception, School of Artificial Intelligence, Peking University, Beijing, China+Institute for Artificial Intelligence, Peking University+Pazhou Lab, Guangzhou, China", "aff_domain": "nankai.edu.cn;pku.edu.cn", "email": "nankai.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/li22o.html", "aff_unique_index": "0;1+1+2", "aff_unique_norm": "Nankai University;Peking University;Pazhou Lab", "aff_unique_dep": "Institute of Robotics and Automatic Information Systems, College of Artificial Intelligence;School of Artificial Intelligence;", "aff_unique_url": "http://www.nankai.edu.cn;http://www.pku.edu.cn;", "aff_unique_abbr": "Nankai;Peking U;", "aff_campus_unique_index": "0;1+3", "aff_campus_unique": "Tianjin;Beijing;;Guangzhou", "aff_country_unique_index": "0;0+0+0", "aff_country_unique": "China" }, { "title": "Rethinking Attention-Model Explainability through Faithfulness Violation Test", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18081", "id": "18081", "proceeding": "https://proceedings.mlr.press/v162/liu22i.html", "poster": "/media/PosterPDFs/ICML%202022/4c7a167bb329bd92580a99ce422d6fa6_24YDysG.png?t=1657364319.8803895", "slides": "/media/icml-2022/Slides/18081.pdf", "author_site": "Yibing Liu, Haoliang Li, Yangyang Guo, Chenqi KONG, Jing 
Li, Shiqi Wang", "author": "Yibing Liu; Haoliang Li; Yangyang Guo; Chenqi Kong; Jing Li; Shiqi Wang", "abstract": "Attention mechanisms are dominating the explainability of deep models. They produce probability distributions over the input, which are widely deemed as feature-importance indicators. However, in this paper, we find one critical limitation in attention explanations: weakness in identifying the polarity of feature impact. This would be somehow misleading \u2013 features with higher attention weights may not faithfully contribute to model predictions; instead, they can impose suppression effects. With this finding, we reflect on the explainability of current attention-based techniques, such as Attention $\\bigodot$ Gradient and LRP-based attention explanations. We first propose an actionable diagnostic methodology (henceforth faithfulness violation test) to measure the consistency between explanation weights and the impact polarity. Through the extensive experiments, we then show that most tested explanation methods are unexpectedly hindered by the faithfulness violation issue, especially the raw attention. Empirical analyses on the factors affecting violation issues further provide useful observations for adopting explanation methods in attention models.", "bibtex": "@InProceedings{pmlr-v162-liu22i,\n title = \t {Rethinking Attention-Model Explainability through Faithfulness Violation Test},\n author = {Liu, Yibing and Li, Haoliang and Guo, Yangyang and Kong, Chenqi and Li, Jing and Wang, Shiqi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13807--13824},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22i/liu22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22i.html},\n abstract = \t {Attention mechanisms are dominating the explainability of deep models. They produce probability distributions over the input, which are widely deemed as feature-importance indicators. However, in this paper, we find one critical limitation in attention explanations: weakness in identifying the polarity of feature impact. This would be somehow misleading \u2013 features with higher attention weights may not faithfully contribute to model predictions; instead, they can impose suppression effects. With this finding, we reflect on the explainability of current attention-based techniques, such as Attention $\\bigodot$ Gradient and LRP-based attention explanations. We first propose an actionable diagnostic methodology (henceforth faithfulness violation test) to measure the consistency between explanation weights and the impact polarity. Through the extensive experiments, we then show that most tested explanation methods are unexpectedly hindered by the faithfulness violation issue, especially the raw attention. 
Empirical analyses on the factors affecting violation issues further provide useful observations for adopting explanation methods in attention models.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22i/liu22i.pdf", "supp": "", "pdf_size": 1996317, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2225803020950336962&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "City University of Hong Kong; City University of Hong Kong; National University of Singapore; City University of Hong Kong; The Hong Kong Polytechnic University; City University of Hong Kong", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/liu22i.html", "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "City University of Hong Kong;National University of Singapore;Hong Kong Polytechnic University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cityu.edu.hk;https://www.nus.edu.sg;https://www.polyu.edu.hk", "aff_unique_abbr": "CityU;NUS;PolyU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;Singapore" }, { "title": "Rethinking Fano\u2019s Inequality in Ensemble Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17157", "id": "17157", "proceeding": "https://proceedings.mlr.press/v162/morishita22a.html", "poster": "/media/PosterPDFs/ICML%202022/ad61ab143223efbc24c7d2583be69251.png?t=1657808692.823125", "slides": "", "author_site": "Terufumi Morishita, Gaku Morio, Shota Horiguchi, Hiroaki Ozaki, Nobuo Nukaga", "author": "Terufumi Morishita; Gaku Morio; Shota Horiguchi; Hiroaki Ozaki; Nobuo Nukaga", "abstract": "We propose a fundamental theory on ensemble learning that evaluates a given ensemble system by a well-grounded set of metrics. Previous studies used a variant of Fano\u2019s inequality of information theory and derived a lower bound of the classification error rate on the basis of the accuracy and diversity of models. We revisit the original Fano\u2019s inequality and argue that the studies did not take into account the information lost when multiple model predictions are combined into a final prediction. To address this issue, we generalize the previous theory to incorporate the information loss. Further, we empirically validate and demonstrate the proposed theory through extensive experiments on actual systems. 
The theory reveals the strengths and weaknesses of systems on each metric, which will push the theoretical understanding of ensemble learning and give us insights into designing systems.", "bibtex": "@InProceedings{pmlr-v162-morishita22a,\n title = \t {Rethinking Fano\u2019s Inequality in Ensemble Learning},\n author = {Morishita, Terufumi and Morio, Gaku and Horiguchi, Shota and Ozaki, Hiroaki and Nukaga, Nobuo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15976--16016},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/morishita22a/morishita22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/morishita22a.html},\n abstract = \t {We propose a fundamental theory on ensemble learning that evaluates a given ensemble system by a well-grounded set of metrics. Previous studies used a variant of Fano\u2019s inequality of information theory and derived a lower bound of the classification error rate on the basis of the accuracy and diversity of models. We revisit the original Fano\u2019s inequality and argue that the studies did not take into account the information lost when multiple model predictions are combined into a final prediction. To address this issue, we generalize the previous theory to incorporate the information loss. Further, we empirically validate and demonstrate the proposed theory through extensive experiments on actual systems. The theory reveals the strengths and weaknesses of systems on each metric, which will push the theoretical understanding of ensemble learning and give us insights into designing systems.}\n}", "pdf": "https://proceedings.mlr.press/v162/morishita22a/morishita22a.pdf", "supp": "", "pdf_size": 2294357, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12629298542195730885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Hitachi, Ltd. Research and Development Group, Kokubunji, Tokyo, Japan; Hitachi, Ltd. Research and Development Group, Kokubunji, Tokyo, Japan; Hitachi, Ltd. Research and Development Group, Kokubunji, Tokyo, Japan; Hitachi, Ltd. Research and Development Group, Kokubunji, Tokyo, Japan; Hitachi, Ltd. 
Research and Development Group, Kokubunji, Tokyo, Japan", "aff_domain": "hitachi.com; ; ; ; ", "email": "hitachi.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/morishita22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Hitachi, Ltd.", "aff_unique_dep": "Research and Development Group", "aff_unique_url": "https://www.hitachi.com", "aff_unique_abbr": "Hitachi", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Kokubunji", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Rethinking Graph Neural Networks for Anomaly Detection", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17967", "id": "17967", "proceeding": "https://proceedings.mlr.press/v162/tang22b.html", "poster": "/media/PosterPDFs/ICML%202022/4e9cec1f583056459111d63e24f3b8ef.png?t=1657622971.7677677", "slides": "", "author_site": "Jianheng Tang, Jiajin Li, Ziqi Gao, Jia Li", "author": "Jianheng Tang; Jiajin Li; Ziqi Gao; Jia Li", "abstract": "Graph Neural Networks (GNNs) are widely applied for graph anomaly detection. As one of the key components for GNN design is to select a tailored spectral filter, we take the first step towards analyzing anomalies via the lens of the graph spectrum. Our crucial observation is the existence of anomalies will lead to the \u2018right-shift\u2019 phenomenon, that is, the spectral energy distribution concentrates less on low frequencies and more on high frequencies. This fact motivates us to propose the Beta Wavelet Graph Neural Network (BWGNN). Indeed, BWGNN has spectral and spatial localized band-pass filters to better handle the \u2018right-shift\u2019 phenomenon in anomalies. We demonstrate the effectiveness of BWGNN on four large-scale anomaly detection datasets. Our code and data are released at https://github.com/squareRoot3/Rethinking-Anomaly-Detection.", "bibtex": "@InProceedings{pmlr-v162-tang22b,\n title = \t {Rethinking Graph Neural Networks for Anomaly Detection},\n author = {Tang, Jianheng and Li, Jiajin and Gao, Ziqi and Li, Jia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21076--21089},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tang22b/tang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/tang22b.html},\n abstract = \t {Graph Neural Networks (GNNs) are widely applied for graph anomaly detection. As one of the key components for GNN design is to select a tailored spectral filter, we take the first step towards analyzing anomalies via the lens of the graph spectrum. Our crucial observation is the existence of anomalies will lead to the \u2018right-shift\u2019 phenomenon, that is, the spectral energy distribution concentrates less on low frequencies and more on high frequencies. This fact motivates us to propose the Beta Wavelet Graph Neural Network (BWGNN). Indeed, BWGNN has spectral and spatial localized band-pass filters to better handle the \u2018right-shift\u2019 phenomenon in anomalies. We demonstrate the effectiveness of BWGNN on four large-scale anomaly detection datasets. 
Our code and data are released at https://github.com/squareRoot3/Rethinking-Anomaly-Detection.}\n}", "pdf": "https://proceedings.mlr.press/v162/tang22b/tang22b.pdf", "supp": "", "pdf_size": 2989773, "gs_citation": 287, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15800828162221381866&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Hong Kong University of Science and Technology (Guangzhou)+Hong Kong University of Science and Technology; Stanford University; Hong Kong University of Science and Technology (Guangzhou)+Hong Kong University of Science and Technology; Hong Kong University of Science and Technology (Guangzhou)+Hong Kong University of Science and Technology", "aff_domain": "ust.hk;stanford.edu;ust.hk;ust.hk", "email": "ust.hk;stanford.edu;ust.hk;ust.hk", "github": "https://github.com/squareRoot3/Rethinking-Anomaly-Detection", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/tang22b.html", "aff_unique_index": "0+0;1;0+0;0+0", "aff_unique_norm": "Hong Kong University of Science and Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ust.hk;https://www.stanford.edu", "aff_unique_abbr": "HKUST;Stanford", "aff_campus_unique_index": "0+0;1;0+0;0+0", "aff_campus_unique": "Hong Kong SAR;Stanford", "aff_country_unique_index": "0+0;1;0+0;0+0", "aff_country_unique": "China;United States" }, { "title": "Rethinking Image-Scaling Attacks: The Interplay Between Vulnerabilities in Machine Learning Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16967", "id": "16967", "proceeding": "https://proceedings.mlr.press/v162/gao22g.html", "poster": "/media/PosterPDFs/ICML%202022/01ded4259d101feb739b06c399e9cd9c_qTaoypQ.png?t=1657861544.4421172", "slides": "", "author_site": "Yue Gao, Ilia Shumailov, Kassem Fawaz", "author": "Yue Gao; Ilia Shumailov; Kassem Fawaz", "abstract": "As real-world images come in varying sizes, the machine learning model is part of a larger system that includes an upstream image scaling algorithm. In this paper, we investigate the interplay between vulnerabilities of the image scaling procedure and machine learning models in the decision-based black-box setting. We propose a novel sampling strategy to make a black-box attack exploit vulnerabilities in scaling algorithms, scaling defenses, and the final machine learning model in an end-to-end manner. Based on this scaling-aware attack, we reveal that most existing scaling defenses are ineffective under threat from downstream models. Moreover, we empirically observe that standard black-box attacks can significantly improve their performance by exploiting the vulnerable scaling procedure. 
We further demonstrate this problem on a commercial Image Analysis API with decision-based black-box attacks.", "bibtex": "@InProceedings{pmlr-v162-gao22g,\n title = \t {Rethinking Image-Scaling Attacks: The Interplay Between Vulnerabilities in Machine Learning Systems},\n author = {Gao, Yue and Shumailov, Ilia and Fawaz, Kassem},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7102--7121},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22g/gao22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22g.html},\n abstract = \t {As real-world images come in varying sizes, the machine learning model is part of a larger system that includes an upstream image scaling algorithm. In this paper, we investigate the interplay between vulnerabilities of the image scaling procedure and machine learning models in the decision-based black-box setting. We propose a novel sampling strategy to make a black-box attack exploit vulnerabilities in scaling algorithms, scaling defenses, and the final machine learning model in an end-to-end manner. Based on this scaling-aware attack, we reveal that most existing scaling defenses are ineffective under threat from downstream models. Moreover, we empirically observe that standard black-box attacks can significantly improve their performance by exploiting the vulnerable scaling procedure. We further demonstrate this problem on a commercial Image Analysis API with decision-based black-box attacks.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22g/gao22g.pdf", "supp": "", "pdf_size": 6514440, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9730023948978190760&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Wisconsin\u2013Madison, Madison, WI, USA; Vector Institute, Toronto, ON, Canada; University of Wisconsin\u2013Madison, Madison, WI, USA", "aff_domain": "cs.wisc.edu; ; ", "email": "cs.wisc.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gao22g.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Wisconsin\u2013Madison;Vector Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://vectorinstitute.ai/", "aff_unique_abbr": "UW\u2013Madison;Vector Institute", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Madison;Toronto", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Retrieval-Augmented Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17147", "id": "17147", "proceeding": "https://proceedings.mlr.press/v162/goyal22a.html", "poster": "/media/PosterPDFs/ICML%202022/f77ad541b6b5bc48c47d814b95491fbd.png?t=1657947131.228635", "slides": "/media/icml-2022/Slides/17147.pdf", "author_site": "Anirudh Goyal, Abe Friesen Friesen, Andrea Banino, Theophane Weber, Nan Rosemary Ke, Adri\u00e0 Puigdomenech Badia, Arthur Guez, Mehdi Mirza, Peter Humphreys, Ksenia Konyushkova, Michal Valko, Simon Osindero, Timothy Lillicrap, Nicolas Heess, Charles Blundell", "author": "Anirudh Goyal; Abram Friesen; Andrea Banino; Theophane Weber; Nan Rosemary Ke; Adri\u00e0 Puigdom\u00e8nech Badia; 
Arthur Guez; Mehdi Mirza; Peter C Humphreys; Ksenia Konyushova; Michal Valko; Simon Osindero; Timothy Lillicrap; Nicolas Heess; Charles Blundell", "abstract": "Most deep reinforcement learning (RL) algorithms distill experience into parametric behavior policies or value functions via gradient updates. While effective, this approach has several disadvantages: (1) it is computationally expensive, (2) it can take many updates to integrate experiences into the parametric model, (3) experiences that are not fully integrated do not appropriately influence the agent\u2019s behavior, and (4) behavior is limited by the capacity of the model. In this paper we explore an alternative paradigm in which we train a network to map a dataset of past experiences to optimal behavior. Specifically, we augment an RL agent with a retrieval process (parameterized as a neural network) that has direct access to a dataset of experiences. This dataset can come from the agent\u2019s past experiences, expert demonstrations, or any other relevant source. The retrieval process is trained to retrieve information from the dataset that may be useful in the current context, to help the agent achieve its goal faster and more efficiently. The proposed method facilitates learning agents that at test time can condition their behavior on the entire dataset and not only the current state, or current trajectory. We integrate our method into two different RL agents: an offline DQN agent and an online R2D2 agent. In offline multi-task problems, we show that the retrieval-augmented DQN agent avoids task interference and learns faster than the baseline DQN agent. On Atari, we show that retrieval-augmented R2D2 learns significantly faster than the baseline R2D2 agent and achieves higher scores. We run extensive ablations to measure the contributions of the components of our proposed method.", "bibtex": "@InProceedings{pmlr-v162-goyal22a,\n title = \t {Retrieval-Augmented Reinforcement Learning},\n author = {Goyal, Anirudh and Friesen, Abram and Banino, Andrea and Weber, Theophane and Ke, Nan Rosemary and Badia, Adri{\\`a} Puigdom{\\`e}nech and Guez, Arthur and Mirza, Mehdi and Humphreys, Peter C and Konyushova, Ksenia and Valko, Michal and Osindero, Simon and Lillicrap, Timothy and Heess, Nicolas and Blundell, Charles},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7740--7765},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/goyal22a/goyal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/goyal22a.html},\n abstract = \t {Most deep reinforcement learning (RL) algorithms distill experience into parametric behavior policies or value functions via gradient updates. While effective, this approach has several disadvantages: (1) it is computationally expensive, (2) it can take many updates to integrate experiences into the parametric model, (3) experiences that are not fully integrated do not appropriately influence the agent\u2019s behavior, and (4) behavior is limited by the capacity of the model. In this paper we explore an alternative paradigm in which we train a network to map a dataset of past experiences to optimal behavior. 
Specifically, we augment an RL agent with a retrieval process (parameterized as a neural network) that has direct access to a dataset of experiences. This dataset can come from the agent\u2019s past experiences, expert demonstrations, or any other relevant source. The retrieval process is trained to retrieve information from the dataset that may be useful in the current context, to help the agent achieve its goal faster and more efficiently. The proposed method facilitates learning agents that at test time can condition their behavior on the entire dataset and not only the current state, or current trajectory. We integrate our method into two different RL agents: an offline DQN agent and an online R2D2 agent. In offline multi-task problems, we show that the retrieval-augmented DQN agent avoids task interference and learns faster than the baseline DQN agent. On Atari, we show that retrieval-augmented R2D2 learns significantly faster than the baseline R2D2 agent and achieves higher scores. We run extensive ablations to measure the contributions of the components of our proposed method.}\n}", "pdf": "https://proceedings.mlr.press/v162/goyal22a/goyal22a.pdf", "supp": "", "pdf_size": 2994368, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11016479255634907533&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": ";;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;", "github": "", "project": "", "author_num": 15, "oa": "https://proceedings.mlr.press/v162/goyal22a.html" }, { "title": "RetrievalGuard: Provably Robust 1-Nearest Neighbor Image Retrieval", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16217", "id": "16217", "proceeding": "https://proceedings.mlr.press/v162/wu22o.html", "poster": "/media/PosterPDFs/ICML%202022/24759597b99d21ef84e6c86c29e56aab.png?t=1657475070.6204824", "slides": "", "author_site": "Yihan Wu, Hongyang Zhang, Heng Huang", "author": "Yihan Wu; Hongyang Zhang; Heng Huang", "abstract": "Recent research works have shown that image retrieval models are vulnerable to adversarial attacks, where slightly modified test inputs could lead to problematic retrieval results. In this paper, we aim to design a provably robust image retrieval model which keeps the most important evaluation metric Recall@1 invariant to adversarial perturbation. We propose the first 1-nearest neighbor (NN) image retrieval algorithm, RetrievalGuard, which is provably robust against adversarial perturbations within an $\\ell_2$ ball of calculable radius. The challenge is to design a provably robust algorithm that takes into consideration the 1-NN search and the high-dimensional nature of the embedding space. Algorithmically, given a base retrieval model and a query sample, we build a smoothed retrieval model by carefully analyzing the 1-NN search procedure in the high-dimensional embedding space. We show that the smoothed retrieval model has bounded Lipschitz constant and thus the retrieval score is invariant to $\\ell_2$ adversarial perturbations. 
Experiments on image retrieval tasks validate the robustness of our RetrievalGuard method.", "bibtex": "@InProceedings{pmlr-v162-wu22o,\n title = \t {{R}etrieval{G}uard: Provably Robust 1-Nearest Neighbor Image Retrieval},\n author = {Wu, Yihan and Zhang, Hongyang and Huang, Heng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24266--24279},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22o/wu22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22o.html},\n abstract = \t {Recent research works have shown that image retrieval models are vulnerable to adversarial attacks, where slightly modified test inputs could lead to problematic retrieval results. In this paper, we aim to design a provably robust image retrieval model which keeps the most important evaluation metric Recall@1 invariant to adversarial perturbation. We propose the first 1-nearest neighbor (NN) image retrieval algorithm, RetrievalGuard, which is provably robust against adversarial perturbations within an $\\ell_2$ ball of calculable radius. The challenge is to design a provably robust algorithm that takes into consideration the 1-NN search and the high-dimensional nature of the embedding space. Algorithmically, given a base retrieval model and a query sample, we build a smoothed retrieval model by carefully analyzing the 1-NN search procedure in the high-dimensional embedding space. We show that the smoothed retrieval model has bounded Lipschitz constant and thus the retrieval score is invariant to $\\ell_2$ adversarial perturbations. Experiments on image retrieval tasks validate the robustness of our RetrievalGuard method.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22o/wu22o.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wu22o-supp.zip", "pdf_size": 443462, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12369656152822123792&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical and Computer Engineering, University of Pittsburgh, USA+David R. Cheriton School of Computer Science, University of Waterloo, Canada; David R. Cheriton School of Computer Science, University of Waterloo, Canada; Department of Electrical and Computer Engineering, University of Pittsburgh, USA", "aff_domain": "pitt.edu;uwaterloo.ca;gmail.com", "email": "pitt.edu;uwaterloo.ca;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22o.html", "aff_unique_index": "0+1;1;0", "aff_unique_norm": "University of Pittsburgh;University of Waterloo", "aff_unique_dep": "Department of Electrical and Computer Engineering;David R. 
Cheriton School of Computer Science", "aff_unique_url": "https://www.pitt.edu;https://uwaterloo.ca", "aff_unique_abbr": "Pitt;UWaterloo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Retroformer: Pushing the Limits of End-to-end Retrosynthesis Transformer", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18129", "id": "18129", "proceeding": "https://proceedings.mlr.press/v162/wan22a.html", "poster": "/media/PosterPDFs/ICML%202022/096ffc299200f51751b08da6d865ae95.png?t=1657185465.9048996", "slides": "", "author_site": "Yue Wan, Chang-Yu (Kim) Hsieh, Ben Liao, Shengyu Zhang", "author": "Yue Wan; Chang-Yu Hsieh; Ben Liao; Shengyu Zhang", "abstract": "Retrosynthesis prediction is one of the fundamental challenges in organic synthesis. The task is to predict the reactants given a core product. With the advancement of machine learning, computer-aided synthesis planning has gained increasing interest. Numerous methods were proposed to solve this problem with different levels of dependency on additional chemical knowledge. In this paper, we propose Retroformer, a novel Transformer-based architecture for retrosynthesis prediction without relying on any cheminformatics tools for molecule editing. Via the proposed local attention head, the model can jointly encode the molecular sequence and graph, and efficiently exchange information between the local reactive region and the global reaction context. Retroformer reaches the new state-of-the-art accuracy for the end-to-end template-free retrosynthesis, and improves over many strong baselines on better molecule and reaction validity. In addition, its generative procedure is highly interpretable and controllable. Overall, Retroformer pushes the limits of the reaction reasoning ability of deep generative models.", "bibtex": "@InProceedings{pmlr-v162-wan22a,\n title = \t {Retroformer: Pushing the Limits of End-to-end Retrosynthesis Transformer},\n author = {Wan, Yue and Hsieh, Chang-Yu and Liao, Ben and Zhang, Shengyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22475--22490},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wan22a/wan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/wan22a.html},\n abstract = \t {Retrosynthesis prediction is one of the fundamental challenges in organic synthesis. The task is to predict the reactants given a core product. With the advancement of machine learning, computer-aided synthesis planning has gained increasing interest. Numerous methods were proposed to solve this problem with different levels of dependency on additional chemical knowledge. In this paper, we propose Retroformer, a novel Transformer-based architecture for retrosynthesis prediction without relying on any cheminformatics tools for molecule editing. Via the proposed local attention head, the model can jointly encode the molecular sequence and graph, and efficiently exchange information between the local reactive region and the global reaction context. 
Retroformer reaches the new state-of-the-art accuracy for the end-to-end template-free retrosynthesis, and improves over many strong baselines on better molecule and reaction validity. In addition, its generative procedure is highly interpretable and controllable. Overall, Retroformer pushes the limits of the reaction reasoning ability of deep generative models.}\n}", "pdf": "https://proceedings.mlr.press/v162/wan22a/wan22a.pdf", "supp": "", "pdf_size": 5555125, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15256735878364760390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Tencent Quantum Laboratory, Shenzhen, China; Tencent Quantum Laboratory, Shenzhen, China; Tencent Quantum Laboratory, Shenzhen, China; Tencent Quantum Laboratory, Shenzhen, China", "aff_domain": "tencent.com;tencent.com;tencent.com;tencent.com", "email": "tencent.com;tencent.com;tencent.com;tencent.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wan22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tencent", "aff_unique_dep": "Quantum Computing", "aff_unique_url": "https://quantum.tencent.com", "aff_unique_abbr": "Tencent Quantum", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Reverse Engineering $\\ell_p$ attacks: A block-sparse optimization approach with recovery guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16675", "id": "16675", "proceeding": "https://proceedings.mlr.press/v162/thaker22a.html", "poster": "", "slides": "", "author_site": "Darshan Thaker, Paris Giampouras, Rene Vidal", "author": "Darshan Thaker; Paris Giampouras; Rene Vidal", "abstract": "Deep neural network-based classifiers have been shown to be vulnerable to imperceptible perturbations to their input, such as $\\ell_p$-bounded norm adversarial attacks. This has motivated the development of many defense methods, which are then broken by new attacks, and so on. This paper focuses on a different but related problem of reverse engineering adversarial attacks. Specifically, given an attacked signal, we study conditions under which one can determine the type of attack ($\\ell_1$, $\\ell_2$ or $\\ell_\\infty$) and recover the clean signal. We pose this problem as a block-sparse recovery problem, where both the signal and the attack are assumed to lie in a union of subspaces that includes one subspace per class and one subspace per attack type. We derive geometric conditions on the subspaces under which any attacked signal can be decomposed as the sum of a clean signal plus an attack. In addition, by determining the subspaces that contain the signal and the attack, we can also classify the signal and determine the attack type. 
Experiments on digit and face classification demonstrate the effectiveness of the proposed approach.", "bibtex": "@InProceedings{pmlr-v162-thaker22a,\n title = \t {Reverse Engineering $\\ell_p$ attacks: A block-sparse optimization approach with recovery guarantees},\n author = {Thaker, Darshan and Giampouras, Paris and Vidal, Rene},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21253--21271},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/thaker22a/thaker22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/thaker22a.html},\n abstract = \t {Deep neural network-based classifiers have been shown to be vulnerable to imperceptible perturbations to their input, such as $\\ell_p$-bounded norm adversarial attacks. This has motivated the development of many defense methods, which are then broken by new attacks, and so on. This paper focuses on a different but related problem of reverse engineering adversarial attacks. Specifically, given an attacked signal, we study conditions under which one can determine the type of attack ($\\ell_1$, $\\ell_2$ or $\\ell_\\infty$) and recover the clean signal. We pose this problem as a block-sparse recovery problem, where both the signal and the attack are assumed to lie in a union of subspaces that includes one subspace per class and one subspace per attack type. We derive geometric conditions on the subspaces under which any attacked signal can be decomposed as the sum of a clean signal plus an attack. In addition, by determining the subspaces that contain the signal and the attack, we can also classify the signal and determine the attack type. 
Experiments on digit and face classification demonstrate the effectiveness of the proposed approach.}\n}", "pdf": "https://proceedings.mlr.press/v162/thaker22a/thaker22a.pdf", "supp": "", "pdf_size": 803519, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14253494845022222792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Mathematical Institute for Data Science, Johns Hopkins University, Baltimore, MD USA; Mathematical Institute for Data Science, Johns Hopkins University, Baltimore, MD USA; Mathematical Institute for Data Science, Johns Hopkins University, Baltimore, MD USA", "aff_domain": "jhu.edu;jhu.edu; ", "email": "jhu.edu;jhu.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/thaker22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Mathematical Institute for Data Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Reverse Engineering the Neural Tangent Kernel", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17143", "id": "17143", "proceeding": "https://proceedings.mlr.press/v162/simon22a.html", "poster": "/media/PosterPDFs/ICML%202022/b20bb95ab626d93fd976af958fbc61ba.png?t=1657210375.6259217", "slides": "", "author_site": "James B Simon, Sajant Anand, Michael R DeWeese", "author": "James Benjamin Simon; Sajant Anand; Mike Deweese", "abstract": "The development of methods to guide the design of neural networks is an important open challenge for deep learning theory. As a paradigm for principled neural architecture design, we propose the translation of high-performing kernels, which are better-understood and amenable to first-principles design, into equivalent network architectures, which have superior efficiency, flexibility, and feature learning. To this end, we constructively prove that, with just an appropriate choice of activation function, any positive-semidefinite dot-product kernel can be realized as either the NNGP or neural tangent kernel of a fully-connected neural network with only one hidden layer. We verify our construction numerically and demonstrate its utility as a design tool for finite fully-connected networks in several experiments.", "bibtex": "@InProceedings{pmlr-v162-simon22a,\n title = \t {Reverse Engineering the Neural Tangent Kernel},\n author = {Simon, James Benjamin and Anand, Sajant and Deweese, Mike},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20215--20231},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/simon22a/simon22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/simon22a.html},\n abstract = \t {The development of methods to guide the design of neural networks is an important open challenge for deep learning theory. 
As a paradigm for principled neural architecture design, we propose the translation of high-performing kernels, which are better-understood and amenable to first-principles design, into equivalent network architectures, which have superior efficiency, flexibility, and feature learning. To this end, we constructively prove that, with just an appropriate choice of activation function, any positive-semidefinite dot-product kernel can be realized as either the NNGP or neural tangent kernel of a fully-connected neural network with only one hidden layer. We verify our construction numerically and demonstrate its utility as a design tool for finite fully-connected networks in several experiments.}\n}", "pdf": "https://proceedings.mlr.press/v162/simon22a/simon22a.pdf", "supp": "", "pdf_size": 685000, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17490938514076901442&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Physics, University of California, Berkeley; Department of Physics, University of California, Berkeley; Department of Physics, University of California, Berkeley + Redwood Center for Theoretical Neuroscience and Helen Wills Neuroscience Institute, University of California, Berkeley", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "https://github.com/james-simon/reverse-engineering", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/simon22a.html", "aff_unique_index": "0;0;0+0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Physics", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0+0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "United States" }, { "title": "Revisiting Consistency Regularization for Deep Partial Label Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17809", "id": "17809", "proceeding": "https://proceedings.mlr.press/v162/wu22l.html", "poster": "/media/PosterPDFs/ICML%202022/dc6a7e655d7e5840e66733e9ee67cc69.png?t=1657375768.8451424", "slides": "", "author_site": "Dong-Dong Wu, Deng-Bao Wang, Min-Ling Zhang", "author": "Dong-Dong Wu; Deng-Bao Wang; Min-Ling Zhang", "abstract": "Partial label learning (PLL), which refers to the classification task where each training instance is ambiguously annotated with a set of candidate labels, has been recently studied in deep learning paradigm. Despite advances in recent deep PLL literature, existing methods (e.g., methods based on self-training or contrastive learning) are confronted with either ineffectiveness or inefficiency. In this paper, we revisit a simple idea namely consistency regularization, which has been shown effective in traditional PLL literature, to guide the training of deep models. Towards this goal, a new regularized training framework, which performs supervised learning on non-candidate labels and employs consistency regularization on candidate labels, is proposed for PLL. We instantiate the regularization term by matching the outputs of multiple augmentations of an instance to a conformal label distribution, which can be adaptively inferred by the closed-form solution. 
Experiments on benchmark datasets demonstrate the superiority of the proposed method compared with other state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-wu22l,\n title = \t {Revisiting Consistency Regularization for Deep Partial Label Learning},\n author = {Wu, Dong-Dong and Wang, Deng-Bao and Zhang, Min-Ling},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24212--24225},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22l/wu22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22l.html},\n abstract = \t {Partial label learning (PLL), which refers to the classification task where each training instance is ambiguously annotated with a set of candidate labels, has been recently studied in deep learning paradigm. Despite advances in recent deep PLL literature, existing methods (e.g., methods based on self-training or contrastive learning) are confronted with either ineffectiveness or inefficiency. In this paper, we revisit a simple idea namely consistency regularization, which has been shown effective in traditional PLL literature, to guide the training of deep models. Towards this goal, a new regularized training framework, which performs supervised learning on non-candidate labels and employs consistency regularization on candidate labels, is proposed for PLL. We instantiate the regularization term by matching the outputs of multiple augmentations of an instance to a conformal label distribution, which can be adaptively inferred by the closed-form solution. 
Experiments on benchmark datasets demonstrate the superiority of the proposed method compared with other state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22l/wu22l.pdf", "supp": "", "pdf_size": 1073369, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16173932876388728358&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science and Engineering, Southeast University, Nanjing 210096, China+Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China; School of Computer Science and Engineering, Southeast University, Nanjing 210096, China+Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China; School of Computer Science and Engineering, Southeast University, Nanjing 210096, China+Key Laboratory of Computer Network and Information Integration (Southeast University), Ministry of Education, China", "aff_domain": "seu.edu.cn;seu.edu.cn;seu.edu.cn", "email": "seu.edu.cn;seu.edu.cn;seu.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22l.html", "aff_unique_index": "0+0;0+0;0+0", "aff_unique_norm": "Southeast University", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "https://www.seu.edu.cn/", "aff_unique_abbr": "SEU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing;", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "China" }, { "title": "Revisiting Contrastive Learning through the Lens of Neighborhood Component Analysis: an Integrated Framework", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16999", "id": "16999", "proceeding": "https://proceedings.mlr.press/v162/ko22a.html", "poster": "/media/PosterPDFs/ICML%202022/dde16b86c64390f0af01275a44d3a42d.png?t=1658095480.015184", "slides": "", "author_site": "Ching-Yun (Irene) Ko, Jeet Mohapatra, Sijia Liu, Pin-Yu Chen, Luca Daniel, Lily Weng", "author": "Ching-Yun Ko; Jeet Mohapatra; Sijia Liu; Pin-Yu Chen; Luca Daniel; Lily Weng", "abstract": "As a seminal tool in self-supervised representation learning, contrastive learning has gained unprecedented attention in recent years. In essence, contrastive learning aims to leverage pairs of positive and negative samples for representation learning, which relates to exploiting neighborhood information in a feature space. By investigating the connection between contrastive learning and neighborhood component analysis (NCA), we provide a novel stochastic nearest neighbor viewpoint of contrastive learning and subsequently propose a series of contrastive losses that outperform the existing ones. Under our proposed framework, we show a new methodology to design integrated contrastive losses that could simultaneously achieve good accuracy and robustness on downstream tasks. 
With the integrated framework, we achieve up to 6% improvement on the standard accuracy and 17% improvement on the robust accuracy.", "bibtex": "@InProceedings{pmlr-v162-ko22a,\n title = \t {Revisiting Contrastive Learning through the Lens of Neighborhood Component Analysis: an Integrated Framework},\n author = {Ko, Ching-Yun and Mohapatra, Jeet and Liu, Sijia and Chen, Pin-Yu and Daniel, Luca and Weng, Lily},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11387--11412},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ko22a/ko22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ko22a.html},\n abstract = \t {As a seminal tool in self-supervised representation learning, contrastive learning has gained unprecedented attention in recent years. In essence, contrastive learning aims to leverage pairs of positive and negative samples for representation learning, which relates to exploiting neighborhood information in a feature space. By investigating the connection between contrastive learning and neighborhood component analysis (NCA), we provide a novel stochastic nearest neighbor viewpoint of contrastive learning and subsequently propose a series of contrastive losses that outperform the existing ones. Under our proposed framework, we show a new methodology to design integrated contrastive losses that could simultaneously achieve good accuracy and robustness on downstream tasks. With the integrated framework, we achieve up to 6% improvement on the standard accuracy and 17% improvement on the robust accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/ko22a/ko22a.pdf", "supp": "", "pdf_size": 1246430, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6422514382686875606&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "MIT; MIT; MSU; IBM Research AI; MIT; UCSD", "aff_domain": "mit.edu; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ko22a.html", "aff_unique_index": "0;0;1;2;0;3", "aff_unique_norm": "Massachusetts Institute of Technology;Michigan State University;IBM;University of California, San Diego", "aff_unique_dep": ";;AI;", "aff_unique_url": "https://web.mit.edu;https://www.msu.edu;https://www.ibm.com/research;https://ucsd.edu", "aff_unique_abbr": "MIT;MSU;IBM;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";La Jolla", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Revisiting End-to-End Speech-to-Text Translation From Scratch", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17531", "id": "17531", "proceeding": "https://proceedings.mlr.press/v162/zhang22i.html", "poster": "/media/PosterPDFs/ICML%202022/cfbc6c5cfb8a3e10fab12aa3512153df_foZ1Fe0.png?t=1656949104.6737676", "slides": "/media/icml-2022/Slides/17531.pdf", "author_site": "Biao Zhang, Barry Haddow, Rico Sennrich", "author": "Biao Zhang; Barry Haddow; Rico Sennrich", "abstract": "End-to-end (E2E) speech-to-text translation (ST) often depends on pretraining its encoder and/or decoder using source transcripts via speech recognition or text translation tasks, without which 
translation performance drops substantially. However, transcripts are not always available, and how significant such pretraining is for E2E ST has rarely been studied in the literature. In this paper, we revisit this question and explore the extent to which the quality of E2E ST trained on speech-translation pairs alone can be improved. We reexamine several techniques proven beneficial to ST previously, and offer a set of best practices that biases a Transformer-based E2E ST system toward training from scratch. Besides, we propose parameterized distance penalty to facilitate the modeling of locality in the self-attention model for speech. On four benchmarks covering 23 languages, our experiments show that, without using any transcripts or pretraining, the proposed system reaches and even outperforms previous studies adopting pretraining, although the gap remains in (extremely) low-resource settings. Finally, we discuss neural acoustic feature modeling, where a neural model is designed to extract acoustic features from raw speech signals directly, with the goal to simplify inductive biases and add freedom to the model in describing speech. For the first time, we demonstrate its feasibility and show encouraging results on ST tasks.", "bibtex": "@InProceedings{pmlr-v162-zhang22i,\n title = \t {Revisiting End-to-End Speech-to-Text Translation From Scratch},\n author = {Zhang, Biao and Haddow, Barry and Sennrich, Rico},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26193--26205},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22i/zhang22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22i.html},\n abstract = \t {End-to-end (E2E) speech-to-text translation (ST) often depends on pretraining its encoder and/or decoder using source transcripts via speech recognition or text translation tasks, without which translation performance drops substantially. However, transcripts are not always available, and how significant such pretraining is for E2E ST has rarely been studied in the literature. In this paper, we revisit this question and explore the extent to which the quality of E2E ST trained on speech-translation pairs alone can be improved. We reexamine several techniques proven beneficial to ST previously, and offer a set of best practices that biases a Transformer-based E2E ST system toward training from scratch. Besides, we propose parameterized distance penalty to facilitate the modeling of locality in the self-attention model for speech. On four benchmarks covering 23 languages, our experiments show that, without using any transcripts or pretraining, the proposed system reaches and even outperforms previous studies adopting pretraining, although the gap remains in (extremely) low-resource settings. Finally, we discuss neural acoustic feature modeling, where a neural model is designed to extract acoustic features from raw speech signals directly, with the goal to simplify inductive biases and add freedom to the model in describing speech. 
For the first time, we demonstrate its feasibility and show encouraging results on ST tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22i/zhang22i.pdf", "supp": "", "pdf_size": 464929, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1521111115547925534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Informatics, University of Edinburgh; School of Informatics, University of Edinburgh; Department of Computational Linguistics, University of Zurich", "aff_domain": "ed.ac.uk; ; ", "email": "ed.ac.uk; ; ", "github": "https://github.com/bzhangGo/zero", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22i.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Edinburgh;University of Zurich", "aff_unique_dep": "School of Informatics;Department of Computational Linguistics", "aff_unique_url": "https://www.ed.ac.uk;https://www.unizh.ch", "aff_unique_abbr": "Edinburgh;UZH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh;", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;Switzerland" }, { "title": "Revisiting Label Smoothing and Knowledge Distillation Compatibility: What was Missing?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18229", "id": "18229", "proceeding": "https://proceedings.mlr.press/v162/chandrasegaran22a.html", "poster": "/media/PosterPDFs/ICML%202022/1437751a77305a0c4c8d44127bd8c285_2T2e9Ks.png?t=1657522642.2054756", "slides": "", "author_site": "Keshigeyan Chandrasegaran, Ngoc-Trung Tran, Yunqing Zhao, Ngai-Man Cheung", "author": "Keshigeyan Chandrasegaran; Ngoc-Trung Tran; Yunqing Zhao; Ngai-Man Cheung", "abstract": "This work investigates the compatibility between label smoothing (LS) and knowledge distillation (KD). Contemporary findings addressing this thesis statement take dichotomous standpoints: Muller et al. (2019) and Shen et al. (2021b). Critically, there is no effort to understand and resolve these contradictory findings, leaving the primal question \\text{-} to smooth or not to smooth a teacher network? \\text{-} unanswered. The main contributions of our work are the discovery, analysis and validation of systematic diffusion as the missing concept which is instrumental in understanding and resolving these contradictory findings. This systematic diffusion essentially curtails the benefits of distilling from an LS-trained teacher, thereby rendering KD at increased temperatures ineffective. Our discovery is comprehensively supported by large-scale experiments, analyses and case studies including image classification, neural machine translation and compact student distillation tasks spanning across multiple datasets and teacher-student architectures. Based on our analysis, we suggest practitioners to use an LS-trained teacher with a low-temperature transfer to achieve high performance students. 
Code and models are available at https://keshik6.github.io/revisiting-ls-kd-compatibility/", "bibtex": "@InProceedings{pmlr-v162-chandrasegaran22a,\n title = \t {Revisiting Label Smoothing and Knowledge Distillation Compatibility: What was Missing?},\n author = {Chandrasegaran, Keshigeyan and Tran, Ngoc-Trung and Zhao, Yunqing and Cheung, Ngai-Man},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2890--2916},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chandrasegaran22a/chandrasegaran22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chandrasegaran22a.html},\n abstract = \t {This work investigates the compatibility between label smoothing (LS) and knowledge distillation (KD). Contemporary findings addressing this thesis statement take dichotomous standpoints: Muller et al. (2019) and Shen et al. (2021b). Critically, there is no effort to understand and resolve these contradictory findings, leaving the primal question \\text{-} to smooth or not to smooth a teacher network? \\text{-} unanswered. The main contributions of our work are the discovery, analysis and validation of systematic diffusion as the missing concept which is instrumental in understanding and resolving these contradictory findings. This systematic diffusion essentially curtails the benefits of distilling from an LS-trained teacher, thereby rendering KD at increased temperatures ineffective. Our discovery is comprehensively supported by large-scale experiments, analyses and case studies including image classification, neural machine translation and compact student distillation tasks spanning across multiple datasets and teacher-student architectures. Based on our analysis, we suggest practitioners to use an LS-trained teacher with a low-temperature transfer to achieve high performance students. 
Code and models are available at https://keshik6.github.io/revisiting-ls-kd-compatibility/}\n}", "pdf": "https://proceedings.mlr.press/v162/chandrasegaran22a/chandrasegaran22a.pdf", "supp": "", "pdf_size": 5556467, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7014741791819212008&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Singapore University of Technology and Design (SUTD); Singapore University of Technology and Design (SUTD); Singapore University of Technology and Design (SUTD); Singapore University of Technology and Design (SUTD)", "aff_domain": "sutd.edu.sg;sutd.edu.sg;sutd.edu.sg;sutd.edu.sg", "email": "sutd.edu.sg;sutd.edu.sg;sutd.edu.sg;sutd.edu.sg", "github": "https://github.com/keshik6/revisiting-ls-kd-compatibility", "project": "https://keshik6.github.io/revisiting-ls-kd-compatibility/", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chandrasegaran22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Singapore University of Technology and Design", "aff_unique_dep": "", "aff_unique_url": "https://www.sutd.edu.sg", "aff_unique_abbr": "SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Revisiting Online Submodular Minimization: Gap-Dependent Regret Bounds, Best of Both Worlds and Adversarial Robustness", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17559", "id": "17559", "proceeding": "https://proceedings.mlr.press/v162/ito22a.html", "poster": "/media/PosterPDFs/ICML%202022/78daab4f4fce94374d8a53444c77c2c6.png?t=1658229518.2790604", "slides": "", "author": "Shinji Ito", "abstract": "In this paper, we consider online decision problems with submodular loss functions. For such problems, existing studies have only dealt with worst-case analysis. This study goes beyond worst-case analysis to show instance-dependent regret bounds. More precisely, for each of the full-information and bandit-feedback settings, we propose an algorithm that achieves a gap-dependent O(log T)-regret bound in the stochastic environment and is comparable to the best existing algorithm in the adversarial environment. The proposed algorithms also work well in the stochastic environment with adversarial corruptions, which is an intermediate setting between the stochastic and adversarial environments.", "bibtex": "@InProceedings{pmlr-v162-ito22a,\n title = \t {Revisiting Online Submodular Minimization: Gap-Dependent Regret Bounds, Best of Both Worlds and Adversarial Robustness},\n author = {Ito, Shinji},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9678--9694},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ito22a/ito22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ito22a.html},\n abstract = \t {In this paper, we consider online decision problems with submodular loss functions. For such problems, existing studies have only dealt with worst-case analysis. This study goes beyond worst-case analysis to show instance-dependent regret bounds. 
More precisely, for each of the full-information and bandit-feedback settings, we propose an algorithm that achieves a gap-dependent O(log T)-regret bound in the stochastic environment and is comparable to the best existing algorithm in the adversarial environment. The proposed algorithms also work well in the stochastic environment with adversarial corruptions, which is an intermediate setting between the stochastic and adversarial environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/ito22a/ito22a.pdf", "supp": "", "pdf_size": 324986, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10873833412524763437&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "NEC Corporation, Tokyo, Japan", "aff_domain": "nec.com", "email": "nec.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/ito22a.html", "aff_unique_index": "0", "aff_unique_norm": "NEC Corporation", "aff_unique_dep": "", "aff_unique_url": "https://www.nec.com", "aff_unique_abbr": "NEC", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0", "aff_country_unique": "Japan" }, { "title": "Revisiting Some Common Practices in Cooperative Multi-Agent Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17517", "id": "17517", "proceeding": "https://proceedings.mlr.press/v162/fu22d.html", "poster": "/media/PosterPDFs/ICML%202022/59ab3ba90ae4b4ab84fe69de7b8e3f5f_1NWFtbx.png?t=1656747539.183223", "slides": "", "author_site": "Wei Fu, Chao Yu, Zelai Xu, Jiaqi Yang, Yi Wu", "author": "Wei Fu; Chao Yu; Zelai Xu; Jiaqi Yang; Yi Wu", "abstract": "Many advances in cooperative multi-agent reinforcement learning (MARL) are based on two common design principles: value decomposition and parameter sharing. A typical MARL algorithm of this fashion decomposes a centralized Q-function into local Q-networks with parameters shared across agents. Such an algorithmic paradigm enables centralized training and decentralized execution (CTDE) and leads to efficient learning in practice. Despite all the advantages, we revisit these two principles and show that in certain scenarios, e.g., environments with a highly multi-modal reward landscape, value decomposition, and parameter sharing can be problematic and lead to undesired outcomes. In contrast, policy gradient (PG) methods with individual policies provably converge to an optimal solution in these cases, which partially supports some recent empirical observations that PG can be effective in many MARL testbeds. Inspired by our theoretical analysis, we present practical suggestions on implementing multi-agent PG algorithms for either high rewards or diverse emergent behaviors and empirically validate our findings on a variety of domains, ranging from the simplified matrix and grid-world games to complex benchmarks such as StarCraft Multi-Agent Challenge and Google Research Football. 
We hope our insights could benefit the community towards developing more general and more powerful MARL algorithms.", "bibtex": "@InProceedings{pmlr-v162-fu22d,\n title = \t {Revisiting Some Common Practices in Cooperative Multi-Agent Reinforcement Learning},\n author = {Fu, Wei and Yu, Chao and Xu, Zelai and Yang, Jiaqi and Wu, Yi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6863--6877},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fu22d/fu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/fu22d.html},\n abstract = \t {Many advances in cooperative multi-agent reinforcement learning (MARL) are based on two common design principles: value decomposition and parameter sharing. A typical MARL algorithm of this fashion decomposes a centralized Q-function into local Q-networks with parameters shared across agents. Such an algorithmic paradigm enables centralized training and decentralized execution (CTDE) and leads to efficient learning in practice. Despite all the advantages, we revisit these two principles and show that in certain scenarios, e.g., environments with a highly multi-modal reward landscape, value decomposition, and parameter sharing can be problematic and lead to undesired outcomes. In contrast, policy gradient (PG) methods with individual policies provably converge to an optimal solution in these cases, which partially supports some recent empirical observations that PG can be effective in many MARL testbeds. Inspired by our theoretical analysis, we present practical suggestions on implementing multi-agent PG algorithms for either high rewards or diverse emergent behaviors and empirically validate our findings on a variety of domains, ranging from the simplified matrix and grid-world games to complex benchmarks such as StarCraft Multi-Agent Challenge and Google Research Football. 
We hope our insights could benefit the community towards developing more general and more powerful MARL algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/fu22d/fu22d.pdf", "supp": "", "pdf_size": 7593889, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11720250016377494534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, China; Department of Electronics Engineering, Tsinghua University, China; Department of Electronics Engineering, Tsinghua University, China; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, CA, USA; Shanghai Qi Zhi Institute, China", "aff_domain": "gmail.com; ; ; ;gmail.com", "email": "gmail.com; ; ; ;gmail.com", "github": "", "project": "https://sites.google.com/view/revisiting-marl", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/fu22d.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Tsinghua University;University of California, Berkeley;Shanghai Qi Zhi Institute", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;Department of Electrical Engineering and Computer Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.berkeley.edu;", "aff_unique_abbr": "Tsinghua;UC Berkeley;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Revisiting and Advancing Fast Adversarial Training Through The Lens of Bi-Level Optimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17271", "id": "17271", "proceeding": "https://proceedings.mlr.press/v162/zhang22ak.html", "poster": "/media/PosterPDFs/ICML%202022/778609db5dc7e1a8315717a9cdd8fd6f.png?t=1657334700.3166273", "slides": "", "author_site": "Yihua Zhang, Guanhua Zhang, Prashant Khanduri, Mingyi Hong, Shiyu Chang, Sijia Liu", "author": "Yihua Zhang; Guanhua Zhang; Prashant Khanduri; Mingyi Hong; Shiyu Chang; Sijia Liu", "abstract": "Adversarial training (AT) is a widely recognized defense mechanism to gain the robustness of deep neural networks against adversarial attacks. It is built on min-max optimization (MMO), where the minimizer (i.e., defender) seeks a robust model to minimize the worst-case training loss in the presence of adversarial examples crafted by the maximizer (i.e., attacker). However, the conventional MMO method makes AT hard to scale. Thus, Fast-AT and other recent algorithms attempt to simplify MMO by replacing its maximization step with the single gradient sign-based attack generation step. Although easy to implement, FAST-AT lacks theoretical guarantees, and its empirical performance is unsatisfactory due to the issue of robust catastrophic overfitting when training with strong adversaries. In this paper, we advance Fast-AT from the fresh perspective of bi-level optimization (BLO). We first show that the commonly-used Fast-AT is equivalent to using a stochastic gradient algorithm to solve a linearized BLO problem involving a sign operation. However, the discrete nature of the sign operation makes it difficult to understand the algorithm performance. Inspired by BLO, we design and analyze a new set of robust training algorithms termed Fast Bi-level AT (Fast-BAT), which effectively defends sign-based projected gradient descent (PGD) attacks without using any gradient sign method or explicit robust regularization. 
In practice, we show that our method yields substantial robustness improvements over multiple baselines across multiple models and datasets.", "bibtex": "@InProceedings{pmlr-v162-zhang22ak,\n title = \t {Revisiting and Advancing Fast Adversarial Training Through The Lens of Bi-Level Optimization},\n author = {Zhang, Yihua and Zhang, Guanhua and Khanduri, Prashant and Hong, Mingyi and Chang, Shiyu and Liu, Sijia},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26693--26712},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ak/zhang22ak.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ak.html},\n abstract = \t {Adversarial training (AT) is a widely recognized defense mechanism to gain the robustness of deep neural networks against adversarial attacks. It is built on min-max optimization (MMO), where the minimizer (i.e., defender) seeks a robust model to minimize the worst-case training loss in the presence of adversarial examples crafted by the maximizer (i.e., attacker). However, the conventional MMO method makes AT hard to scale. Thus, Fast-AT and other recent algorithms attempt to simplify MMO by replacing its maximization step with the single gradient sign-based attack generation step. Although easy to implement, FAST-AT lacks theoretical guarantees, and its empirical performance is unsatisfactory due to the issue of robust catastrophic overfitting when training with strong adversaries. In this paper, we advance Fast-AT from the fresh perspective of bi-level optimization (BLO). We first show that the commonly-used Fast-AT is equivalent to using a stochastic gradient algorithm to solve a linearized BLO problem involving a sign operation. However, the discrete nature of the sign operation makes it difficult to understand the algorithm performance. Inspired by BLO, we design and analyze a new set of robust training algorithms termed Fast Bi-level AT (Fast-BAT), which effectively defends sign-based projected gradient descent (PGD) attacks without using any gradient sign method or explicit robust regularization. 
In practice, we show that our method yields substantial robustness improvements over multiple baselines across multiple models and datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ak/zhang22ak.pdf", "supp": "", "pdf_size": 1084527, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13176476866209995495&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Michigan State University; UC Santa Barbara; University of Minnesota; University of Minnesota; UC Santa Barbara; Michigan State University+MIT-IBM Watson AI Lab, IBM Research", "aff_domain": "msu.edu;ucsb.edu; ; ; ; ", "email": "msu.edu;ucsb.edu; ; ; ; ", "github": "https://github.com/OPTML-Group/Fast-BAT", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhang22ak.html", "aff_unique_index": "0;1;2;2;1;0+3", "aff_unique_norm": "Michigan State University;University of California, Santa Barbara;University of Minnesota;IBM", "aff_unique_dep": ";;;AI Lab", "aff_unique_url": "https://www.msu.edu;https://www.ucsb.edu;https://www.minnesota.edu;https://www.ibmwatsonai.org/", "aff_unique_abbr": "MSU;UCSB;UMN;MIT-IBM AI Lab", "aff_campus_unique_index": "1;1;", "aff_campus_unique": ";Santa Barbara", "aff_country_unique_index": "0;0;0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Revisiting the Effects of Stochasticity for Hamiltonian Samplers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16165", "id": "16165", "proceeding": "https://proceedings.mlr.press/v162/franzese22a.html", "poster": "/media/PosterPDFs/ICML%202022/8420d359404024567b5aefda1231af24.png?t=1655992558.8206344", "slides": "/media/icml-2022/Slides/16165.pdf", "author_site": "Giulio Franzese, Dimitrios Milios, Maurizio Filippone, Pietro Michiardi", "author": "Giulio Franzese; Dimitrios Milios; Maurizio Filippone; Pietro Michiardi", "abstract": "We revisit the theoretical properties of Hamiltonian stochastic differential equations (SDES) for Bayesian posterior sampling, and we study the two types of errors that arise from numerical SDE simulation: the discretization error and the error due to noisy gradient estimates in the context of data subsampling. Our main result is a novel analysis for the effect of mini-batches through the lens of differential operator splitting, revising previous literature results. The stochastic component of a Hamiltonian SDE is decoupled from the gradient noise, for which we make no normality assumptions. This leads to the identification of a convergence bottleneck: when considering mini-batches, the best achievable error rate is $\\mathcal{O}(\\eta^2)$, with $\\eta$ being the integrator step size. 
Our theoretical results are supported by an empirical study on a variety of regression and classification tasks for Bayesian neural networks.", "bibtex": "@InProceedings{pmlr-v162-franzese22a,\n title = \t {Revisiting the Effects of Stochasticity for {H}amiltonian Samplers},\n author = {Franzese, Giulio and Milios, Dimitrios and Filippone, Maurizio and Michiardi, Pietro},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6744--6778},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/franzese22a/franzese22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/franzese22a.html},\n abstract = \t {We revisit the theoretical properties of Hamiltonian stochastic differential equations (SDES) for Bayesian posterior sampling, and we study the two types of errors that arise from numerical SDE simulation: the discretization error and the error due to noisy gradient estimates in the context of data subsampling. Our main result is a novel analysis for the effect of mini-batches through the lens of differential operator splitting, revising previous literature results. The stochastic component of a Hamiltonian SDE is decoupled from the gradient noise, for which we make no normality assumptions. This leads to the identification of a convergence bottleneck: when considering mini-batches, the best achievable error rate is $\\mathcal{O}(\\eta^2)$, with $\\eta$ being the integrator step size. Our theoretical results are supported by an empirical study on a variety of regression and classification tasks for Bayesian neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/franzese22a/franzese22a.pdf", "supp": "", "pdf_size": 1504660, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13522089628083770135&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Data Science Department, EURECOM, France; Data Science Department, EURECOM, France; Data Science Department, EURECOM, France; Data Science Department, EURECOM, France", "aff_domain": "eurecom.fr; ; ; ", "email": "eurecom.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/franzese22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "EURECOM", "aff_unique_dep": "Data Science Department", "aff_unique_url": "https://www.eurecom.fr", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Reward-Free RL is No Harder Than Reward-Aware RL in Linear Markov Decision Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16445", "id": "16445", "proceeding": "https://proceedings.mlr.press/v162/wagenmaker22b.html", "poster": "/media/PosterPDFs/ICML%202022/f621585df244e9596dc70a39b579efb1.png?t=1657626065.3953896", "slides": "", "author_site": "Andrew Wagenmaker, Yifang Chen, Max Simchowitz, Simon Du, Kevin Jamieson", "author": "Andrew J Wagenmaker; Yifang Chen; Max Simchowitz; Simon Du; Kevin Jamieson", "abstract": "Reward-free reinforcement learning (RL) considers the setting where the agent does not have access to a reward function during exploration, but must propose a near-optimal policy for an 
arbitrary reward function revealed only after exploring. In the tabular setting, it is well known that this is a more difficult problem than reward-aware (PAC) RL\u2014where the agent has access to the reward function during exploration\u2014with optimal sample complexities in the two settings differing by a factor of $|\\mathcal{S}|$, the size of the state space. We show that this separation does not exist in the setting of linear MDPs. We first develop a computationally efficient algorithm for reward-free RL in a $d$-dimensional linear MDP with sample complexity scaling as $\\widetilde{\\mathcal{O}}(d^2 H^5/\\epsilon^2)$. We then show a lower bound with matching dimension-dependence of $\\Omega(d^2 H^2/\\epsilon^2)$, which holds for the reward-aware RL setting. To our knowledge, our approach is the first computationally efficient algorithm to achieve optimal $d$ dependence in linear MDPs, even in the single-reward PAC setting. Our algorithm relies on a novel procedure which efficiently traverses a linear MDP, collecting samples in any given \u201cfeature direction\u201d, and enjoys a sample complexity scaling optimally in the (linear MDP equivalent of the) maximal state visitation probability. We show that this exploration procedure can also be applied to solve the problem of obtaining \u201cwell-conditioned\u201d covariates in linear MDPs.", "bibtex": "@InProceedings{pmlr-v162-wagenmaker22b,\n title = \t {Reward-Free {RL} is No Harder Than Reward-Aware {RL} in Linear {M}arkov Decision Processes},\n author = {Wagenmaker, Andrew J and Chen, Yifang and Simchowitz, Max and Du, Simon and Jamieson, Kevin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22430--22456},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wagenmaker22b/wagenmaker22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wagenmaker22b.html},\n abstract = \t {Reward-free reinforcement learning (RL) considers the setting where the agent does not have access to a reward function during exploration, but must propose a near-optimal policy for an arbitrary reward function revealed only after exploring. In the tabular setting, it is well known that this is a more difficult problem than reward-aware (PAC) RL\u2014where the agent has access to the reward function during exploration\u2014with optimal sample complexities in the two settings differing by a factor of $|\\mathcal{S}|$, the size of the state space. We show that this separation does not exist in the setting of linear MDPs. We first develop a computationally efficient algorithm for reward-free RL in a $d$-dimensional linear MDP with sample complexity scaling as $\\widetilde{\\mathcal{O}}(d^2 H^5/\\epsilon^2)$. We then show a lower bound with matching dimension-dependence of $\\Omega(d^2 H^2/\\epsilon^2)$, which holds for the reward-aware RL setting. To our knowledge, our approach is the first computationally efficient algorithm to achieve optimal $d$ dependence in linear MDPs, even in the single-reward PAC setting. 
Our algorithm relies on a novel procedure which efficiently traverses a linear MDP, collecting samples in any given \u201cfeature direction\u201d, and enjoys a sample complexity scaling optimally in the (linear MDP equivalent of the) maximal state visitation probability. We show that this exploration procedure can also be applied to solve the problem of obtaining \u201cwell-conditioned\u201d covariates in linear MDPs.}\n}", "pdf": "https://proceedings.mlr.press/v162/wagenmaker22b/wagenmaker22b.pdf", "supp": "", "pdf_size": 491223, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6394521892415618554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; CSAIL, MIT, Cambridge, MA; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle", "aff_domain": "cs.washington.edu; ; ; ; ", "email": "cs.washington.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wagenmaker22b.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Washington;Massachusetts Institute of Technology", "aff_unique_dep": "Paul G. Allen School of Computer Science and Engineering;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.washington.edu;https://www.csail.mit.edu", "aff_unique_abbr": "UW;MIT", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Seattle;Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Rich Feature Construction for the Optimization-Generalization Dilemma", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16599", "id": "16599", "proceeding": "https://proceedings.mlr.press/v162/zhang22u.html", "poster": "/media/PosterPDFs/ICML%202022/8fd7f981e10b41330b618129afcaab2d_HZaSb1Z.png?t=1658076465.799044", "slides": "/media/icml-2022/Slides/16599.pdf", "author_site": "Jianyu Zhang, David Lopez-Paz, L\u00e9on Bottou", "author": "Jianyu Zhang; David Lopez-Paz; Leon Bottou", "abstract": "There often is a dilemma between ease of optimization and robust out-of-distribution (OoD) generalization. For instance, many OoD methods rely on penalty terms whose optimization is challenging. They are either too strong to optimize reliably or too weak to achieve their goals. We propose to initialize the networks with a rich representation containing a palette of potentially useful features, ready to be used by even simple models. On the one hand, a rich representation provides a good initialization for the optimizer. On the other hand, it also provides an inductive bias that helps OoD generalization. Such a representation is constructed with the Rich Feature Construction (RFC) algorithm, also called the Bonsai algorithm, which consists of a succession of training episodes. During discovery episodes, we craft a multi-objective optimization criterion and its associated datasets in a manner that prevents the network from using the features constructed in the previous iterations. During synthesis episodes, we use knowledge distillation to force the network to simultaneously represent all the previously discovered features. 
Initializing the networks with Bonsai representations consistently helps six OoD methods achieve top performance on ColoredMNIST benchmark. The same technique substantially outperforms comparable results on the Wilds Camelyon17 task, eliminates the high result variance that plagues other methods, and makes hyperparameter tuning and model selection more reliable.", "bibtex": "@InProceedings{pmlr-v162-zhang22u,\n title = \t {Rich Feature Construction for the Optimization-Generalization Dilemma},\n author = {Zhang, Jianyu and Lopez-Paz, David and Bottou, Leon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26397--26411},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22u/zhang22u.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22u.html},\n abstract = \t {There often is a dilemma between ease of optimization and robust out-of-distribution (OoD) generalization. For instance, many OoD methods rely on penalty terms whose optimization is challenging. They are either too strong to optimize reliably or too weak to achieve their goals. We propose to initialize the networks with a rich representation containing a palette of potentially useful features, ready to be used by even simple models. On the one hand, a rich representation provides a good initialization for the optimizer. On the other hand, it also provides an inductive bias that helps OoD generalization. Such a representation is constructed with the Rich Feature Construction (RFC) algorithm, also called the Bonsai algorithm, which consists of a succession of training episodes. During discovery episodes, we craft a multi-objective optimization criterion and its associated datasets in a manner that prevents the network from using the features constructed in the previous iterations. During synthesis episodes, we use knowledge distillation to force the network to simultaneously represent all the previously discovered features. Initializing the networks with Bonsai representations consistently helps six OoD methods achieve top performance on ColoredMNIST benchmark. 
The same technique substantially outperforms comparable results on the Wilds Camelyon17 task, eliminates the high result variance that plagues other methods, and makes hyperparameter tuning and model selection more reliable.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22u/zhang22u.pdf", "supp": "", "pdf_size": 1034254, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4651591858912243934&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "New York University, New York, NY, USA; Facebook AI Research, Paris, France; Facebook AI Research, New York, NY, USA", "aff_domain": "nyu.edu; ; ", "email": "nyu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22u.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "New York University;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.nyu.edu;https://research.facebook.com", "aff_unique_abbr": "NYU;FAIR", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "New York;Paris", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;France" }, { "title": "RieszNet and ForestRiesz: Automatic Debiased Machine Learning with Neural Nets and Random Forests", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16311", "id": "16311", "proceeding": "https://proceedings.mlr.press/v162/chernozhukov22a.html", "poster": "/media/PosterPDFs/ICML%202022/99e7e6ce097324aceb45f98299ceb621.png?t=1657468716.6704414", "slides": "/media/icml-2022/Slides/16311.pdf", "author_site": "Victor Chernozhukov, Whitney Newey, V\u00edctor Quintas-Mart\u00ednez, Vasilis Syrgkanis", "author": "Victor Chernozhukov; Whitney Newey; V\u0131\u0301ctor M Quintas-Mart\u0131\u0301nez; Vasilis Syrgkanis", "abstract": "Many causal and policy effects of interest are defined by linear functionals of high-dimensional or non-parametric regression functions. $\\sqrt{n}$-consistent and asymptotically normal estimation of the object of interest requires debiasing to reduce the effects of regularization and/or model selection on the object of interest. Debiasing is typically achieved by adding a correction term to the plug-in estimator of the functional, which leads to properties such as semi-parametric efficiency, double robustness, and Neyman orthogonality. We implement an automatic debiasing procedure based on automatically learning the Riesz representation of the linear functional using Neural Nets and Random Forests. Our method only relies on black-box evaluation oracle access to the linear functional and does not require knowledge of its analytic form. We propose a multitasking Neural Net debiasing method with stochastic gradient descent minimization of a combined Riesz representer and regression loss, while sharing representation layers for the two functions. We also propose a Random Forest method which learns a locally linear representation of the Riesz function. Even though our method applies to arbitrary functionals, we experimentally find that it performs well compared to the state of art neural net based algorithm of Shi et al. (2019) for the case of the average treatment effect functional. 
We also evaluate our method on the problem of estimating average marginal effects with continuous treatments, using semi-synthetic data of gasoline price changes on gasoline demand.", "bibtex": "@InProceedings{pmlr-v162-chernozhukov22a,\n title = \t {{R}iesz{N}et and {F}orest{R}iesz: Automatic Debiased Machine Learning with Neural Nets and Random Forests},\n author = {Chernozhukov, Victor and Newey, Whitney and Quintas-Mart\\'{\\i}nez, V\\'{\\i}ctor M and Syrgkanis, Vasilis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3901--3914},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chernozhukov22a/chernozhukov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chernozhukov22a.html},\n abstract = \t {Many causal and policy effects of interest are defined by linear functionals of high-dimensional or non-parametric regression functions. $\\sqrt{n}$-consistent and asymptotically normal estimation of the object of interest requires debiasing to reduce the effects of regularization and/or model selection on the object of interest. Debiasing is typically achieved by adding a correction term to the plug-in estimator of the functional, which leads to properties such as semi-parametric efficiency, double robustness, and Neyman orthogonality. We implement an automatic debiasing procedure based on automatically learning the Riesz representation of the linear functional using Neural Nets and Random Forests. Our method only relies on black-box evaluation oracle access to the linear functional and does not require knowledge of its analytic form. We propose a multitasking Neural Net debiasing method with stochastic gradient descent minimization of a combined Riesz representer and regression loss, while sharing representation layers for the two functions. We also propose a Random Forest method which learns a locally linear representation of the Riesz function. Even though our method applies to arbitrary functionals, we experimentally find that it performs well compared to the state of art neural net based algorithm of Shi et al. (2019) for the case of the average treatment effect functional. 
We also evaluate our method on the problem of estimating average marginal effects with continuous treatments, using semi-synthetic data of gasoline price changes on gasoline demand.}\n}", "pdf": "https://proceedings.mlr.press/v162/chernozhukov22a/chernozhukov22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chernozhukov22a-supp.zip", "pdf_size": 395364, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9961128829212907766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Economics, Massachusetts Institute of Technology, Cambridge MA, USA+Microsoft Research New England, Cambridge MA, USA; Department of Economics, Massachusetts Institute of Technology, Cambridge MA, USA+Microsoft Research New England, Cambridge MA, USA; Department of Economics, Massachusetts Institute of Technology, Cambridge MA, USA+Microsoft Research New England, Cambridge MA, USA; Microsoft Research New England, Cambridge MA, USA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "github.com/victor5as/RieszLearning", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chernozhukov22a.html", "aff_unique_index": "0+1;0+1;0+1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": "Department of Economics;Microsoft Research New England", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com/en-us/research/group/new-england", "aff_unique_abbr": "MIT;MSR NE", "aff_campus_unique_index": "0+0;0+0;0+0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Ripple Attention for Visual Perception with Sub-quadratic Complexity", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16149", "id": "16149", "proceeding": "https://proceedings.mlr.press/v162/zheng22a.html", "poster": "", "slides": "/media/icml-2022/Slides/16149.pdf", "author_site": "Lin Zheng, Huijie Pan, Lingpeng Kong", "author": "Lin Zheng; Huijie Pan; Lingpeng Kong", "abstract": "Transformer architectures are now central to sequence modeling tasks. At its heart is the attention mechanism, which enables effective modeling of long-term dependencies in a sequence. Recently, transformers have been successfully applied in the computer vision domain, where 2D images are first segmented into patches and then treated as 1D sequences. Such linearization, however, impairs the notion of spatial locality in images, which bears important visual clues. To bridge the gap, we propose", "bibtex": "@InProceedings{pmlr-v162-zheng22a,\n title = \t {Ripple Attention for Visual Perception with Sub-quadratic Complexity},\n author = {Zheng, Lin and Pan, Huijie and Kong, Lingpeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26993--27010},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zheng22a/zheng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zheng22a.html},\n abstract = \t {Transformer architectures are now central to sequence modeling tasks. At its heart is the attention mechanism, which enables effective modeling of long-term dependencies in a sequence. 
Recently, transformers have been successfully applied in the computer vision domain, where 2D images are first segmented into patches and then treated as 1D sequences. Such linearization, however, impairs the notion of spatial locality in images, which bears important visual clues. To bridge the gap, we propose", "pdf": "https://proceedings.mlr.press/v162/zheng22a/zheng22a.pdf", "supp": "", "pdf_size": 540854, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8391912899448350799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, The University of Hong Kong; Department of Computer Science, The University of Hong Kong; Department of Computer Science, The University of Hong Kong + Shanghai Artificial Intelligence Laboratory", "aff_domain": "connect.hku.hk; ; ", "email": "connect.hku.hk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zheng22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "University of Hong Kong;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.hku.hk;http://www.shailab.org/", "aff_unique_abbr": "HKU;Shanghai AI Lab", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "China" }, { "title": "Risk-Averse No-Regret Learning in Online Convex Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16587", "id": "16587", "proceeding": "https://proceedings.mlr.press/v162/wang22w.html", "poster": "/media/PosterPDFs/ICML%202022/198dd5fb9c43b2d29a548f8c77e85cf9.png?t=1657674919.2217913", "slides": "", "author_site": "Zifan Wang, Yi Shen, Michael Zavlanos", "author": "Zifan Wang; Yi Shen; Michael Zavlanos", "abstract": "We consider an online stochastic game with risk-averse agents whose goal is to learn optimal decisions that minimize the risk of incurring significantly high costs. Specifically, we use the Conditional Value at Risk (CVaR) as a risk measure that the agents can estimate using bandit feedback in the form of the cost values of only their selected actions. Since the distributions of the cost functions depend on the actions of all agents that are generally unobservable, they are themselves unknown and, therefore, the CVaR values of the costs are difficult to compute. To address this challenge, we propose a new online risk-averse learning algorithm that relies on one-point zeroth-order estimation of the CVaR gradients computed using CVaR values that are estimated by appropriately sampling the cost functions. We show that this algorithm achieves sub-linear regret with high probability. We also propose two variants of this algorithm that improve performance. The first variant relies on a new sampling strategy that uses samples from the previous iteration to improve the estimation accuracy of the CVaR values. The second variant employs residual feedback that uses CVaR values from the previous iteration to reduce the variance of the CVaR gradient estimates. 
We theoretically analyze the convergence properties of these variants and illustrate their performance on an online market problem that we model as a Cournot game.", "bibtex": "@InProceedings{pmlr-v162-wang22w,\n title = \t {Risk-Averse No-Regret Learning in Online Convex Games},\n author = {Wang, Zifan and Shen, Yi and Zavlanos, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22999--23017},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22w/wang22w.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22w.html},\n abstract = \t {We consider an online stochastic game with risk-averse agents whose goal is to learn optimal decisions that minimize the risk of incurring significantly high costs. Specifically, we use the Conditional Value at Risk (CVaR) as a risk measure that the agents can estimate using bandit feedback in the form of the cost values of only their selected actions. Since the distributions of the cost functions depend on the actions of all agents that are generally unobservable, they are themselves unknown and, therefore, the CVaR values of the costs are difficult to compute. To address this challenge, we propose a new online risk-averse learning algorithm that relies on one-point zeroth-order estimation of the CVaR gradients computed using CVaR values that are estimated by appropriately sampling the cost functions. We show that this algorithm achieves sub-linear regret with high probability. We also propose two variants of this algorithm that improve performance. The first variant relies on a new sampling strategy that uses samples from the previous iteration to improve the estimation accuracy of the CVaR values. The second variant employs residual feedback that uses CVaR values from the previous iteration to reduce the variance of the CVaR gradient estimates. 
We theoretically analyze the convergence properties of these variants and illustrate their performance on an online market problem that we model as a Cournot game.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22w/wang22w.pdf", "supp": "", "pdf_size": 812076, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5422161613045691610&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Electrical Engineering and Computer Science, KTH Royal Institute of Technology, Stockholm, Sweden; Department of Mechanical Engineering & Material Science, Duke University, Durham, NC 27708, USA; Department of Mechanical Engineering & Material Science, Duke University, Durham, NC 27708, USA", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22w.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "KTH Royal Institute of Technology;Duke University", "aff_unique_dep": "School of Electrical Engineering and Computer Science;Department of Mechanical Engineering & Material Science", "aff_unique_url": "https://www.kth.se;https://www.duke.edu", "aff_unique_abbr": "KTH;Duke", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Stockholm;Durham", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Sweden;United States" }, { "title": "Robin Hood and Matthew Effects: Differential Privacy Has Disparate Impact on Synthetic Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18247", "id": "18247", "proceeding": "https://proceedings.mlr.press/v162/ganev22a.html", "poster": "/media/PosterPDFs/ICML%202022/19e21d13715b9720d8c00977145f1dd8.png?t=1657566964.940918", "slides": "/media/icml-2022/Slides/18247.pdf", "author_site": "Georgi Ganev, Bristena Oprisanu, Emiliano De Cristofaro", "author": "Georgi Ganev; Bristena Oprisanu; Emiliano De Cristofaro", "abstract": "Generative models trained with Differential Privacy (DP) can be used to generate synthetic data while minimizing privacy risks. We analyze the impact of DP on these models vis-a-vis underrepresented classes/subgroups of data, specifically, studying: 1) the size of classes/subgroups in the synthetic data, and 2) the accuracy of classification tasks run on them. We also evaluate the effect of various levels of imbalance and privacy budgets. Our analysis uses three state-of-the-art DP models (PrivBayes, DP-WGAN, and PATE-GAN) and shows that DP yields opposite size distributions in the generated synthetic data. It affects the gap between the majority and minority classes/subgroups; in some cases by reducing it (a \"Robin Hood\" effect) and, in others, by increasing it (a \"Matthew\" effect). Either way, this leads to (similar) disparate impacts on the accuracy of classification tasks on the synthetic data, affecting disproportionately more the underrepresented subparts of the data. 
Consequently, when training models on synthetic data, one might incur the risk of treating different subpopulations unevenly, leading to unreliable or unfair conclusions.", "bibtex": "@InProceedings{pmlr-v162-ganev22a,\n title = \t {Robin Hood and Matthew Effects: Differential Privacy Has Disparate Impact on Synthetic Data},\n author = {Ganev, Georgi and Oprisanu, Bristena and De Cristofaro, Emiliano},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6944--6959},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ganev22a/ganev22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ganev22a.html},\n abstract = \t {Generative models trained with Differential Privacy (DP) can be used to generate synthetic data while minimizing privacy risks. We analyze the impact of DP on these models vis-a-vis underrepresented classes/subgroups of data, specifically, studying: 1) the size of classes/subgroups in the synthetic data, and 2) the accuracy of classification tasks run on them. We also evaluate the effect of various levels of imbalance and privacy budgets. Our analysis uses three state-of-the-art DP models (PrivBayes, DP-WGAN, and PATE-GAN) and shows that DP yields opposite size distributions in the generated synthetic data. It affects the gap between the majority and minority classes/subgroups; in some cases by reducing it (a \"Robin Hood\" effect) and, in others, by increasing it (a \"Matthew\" effect). Either way, this leads to (similar) disparate impacts on the accuracy of classification tasks on the synthetic data, affecting disproportionately more the underrepresented subparts of the data. 
Consequently, when training models on synthetic data, one might incur the risk of treating different subpopulations unevenly, leading to unreliable or unfair conclusions.}\n}", "pdf": "https://proceedings.mlr.press/v162/ganev22a/ganev22a.pdf", "supp": "", "pdf_size": 1985914, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10922006937846894420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "University College London, London, UK+Hazy, London, UK; University College London, London, UK; University College London, London, UK", "aff_domain": "ucl.ac.uk; ; ", "email": "ucl.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ganev22a.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "University College London;Hazy", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;", "aff_unique_abbr": "UCL;", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Robust Counterfactual Explanations for Tree-Based Ensembles", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16963", "id": "16963", "proceeding": "https://proceedings.mlr.press/v162/dutta22a.html", "poster": "/media/PosterPDFs/ICML%202022/fccc64972a9468a11f125cadb090e89e_emQy5gU.png?t=1657944555.5767674", "slides": "/media/icml-2022/Slides/16963.pdf", "author_site": "Sanghamitra Dutta, Jason Long, Saumitra Mishra, Cecilia Tilli, Daniele Magazzeni", "author": "Sanghamitra Dutta; Jason Long; Saumitra Mishra; Cecilia Tilli; Daniele Magazzeni", "abstract": "Counterfactual explanations inform ways to achieve a desired outcome from a machine learning model. However, such explanations are not robust to certain real-world changes in the underlying model (e.g., retraining the model, changing hyperparameters, etc.), questioning their reliability in several applications, e.g., credit lending. In this work, we propose a novel strategy - that we call RobX - to generate robust counterfactuals for tree-based ensembles, e.g., XGBoost. Tree-based ensembles pose additional challenges in robust counterfactual generation, e.g., they have a non-smooth and non-differentiable objective function, and they can change a lot in the parameter space under retraining on very similar data. We first introduce a novel metric - that we call Counterfactual Stability - that attempts to quantify how robust a counterfactual is going to be to model changes under retraining, and comes with desirable theoretical properties. Our proposed strategy RobX works with any counterfactual generation method (base method) and searches for robust counterfactuals by iteratively refining the counterfactual generated by the base method using our metric Counterfactual Stability. We compare the performance of RobX with popular counterfactual generation methods (for tree-based ensembles) across benchmark datasets. 
The results demonstrate that our strategy generates counterfactuals that are significantly more robust (nearly 100% validity after actual model changes) and also realistic (in terms of local outlier factor) over existing state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-dutta22a,\n title = \t {Robust Counterfactual Explanations for Tree-Based Ensembles},\n author = {Dutta, Sanghamitra and Long, Jason and Mishra, Saumitra and Tilli, Cecilia and Magazzeni, Daniele},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5742--5756},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dutta22a/dutta22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dutta22a.html},\n abstract = \t {Counterfactual explanations inform ways to achieve a desired outcome from a machine learning model. However, such explanations are not robust to certain real-world changes in the underlying model (e.g., retraining the model, changing hyperparameters, etc.), questioning their reliability in several applications, e.g., credit lending. In this work, we propose a novel strategy - that we call RobX - to generate robust counterfactuals for tree-based ensembles, e.g., XGBoost. Tree-based ensembles pose additional challenges in robust counterfactual generation, e.g., they have a non-smooth and non-differentiable objective function, and they can change a lot in the parameter space under retraining on very similar data. We first introduce a novel metric - that we call Counterfactual Stability - that attempts to quantify how robust a counterfactual is going to be to model changes under retraining, and comes with desirable theoretical properties. Our proposed strategy RobX works with any counterfactual generation method (base method) and searches for robust counterfactuals by iteratively refining the counterfactual generated by the base method using our metric Counterfactual Stability. We compare the performance of RobX with popular counterfactual generation methods (for tree-based ensembles) across benchmark datasets. 
The results demonstrate that our strategy generates counterfactuals that are significantly more robust (nearly 100% validity after actual model changes) and also realistic (in terms of local outlier factor) over existing state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/dutta22a/dutta22a.pdf", "supp": "", "pdf_size": 708303, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2501507414986297355&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "JP Morgan Chase AI Research; JP Morgan Chase AI Research; JP Morgan Chase AI Research; JP Morgan Chase AI Research; JP Morgan Chase AI Research", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/dutta22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "JP Morgan Chase", "aff_unique_dep": "AI Research", "aff_unique_url": "https://www.jpmorganchase.com", "aff_unique_abbr": "JPM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Deep Reinforcement Learning through Bootstrapped Opportunistic Curriculum", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16757", "id": "16757", "proceeding": "https://proceedings.mlr.press/v162/wu22k.html", "poster": "/media/PosterPDFs/ICML%202022/a1afc58c6ca9540d057299ec3016d726_MTMhRJT.png?t=1656953125.8281488", "slides": "/media/icml-2022/Slides/16757.pdf", "author_site": "Junlin Wu, Yevgeniy Vorobeychik", "author": "Junlin Wu; Yevgeniy Vorobeychik", "abstract": "Despite considerable advances in deep reinforcement learning, it has been shown to be highly vulnerable to adversarial perturbations to state observations. Recent efforts that have attempted to improve adversarial robustness of reinforcement learning can nevertheless tolerate only very small perturbations, and remain fragile as perturbation size increases. We propose Bootstrapped Opportunistic Adversarial Curriculum Learning (BCL), a novel flexible adversarial curriculum learning framework for robust reinforcement learning. Our framework combines two ideas: conservatively bootstrapping each curriculum phase with highest quality solutions obtained from multiple runs of the previous phase, and opportunistically skipping forward in the curriculum. In our experiments we show that the proposed BCL framework enables dramatic improvements in robustness of learned policies to adversarial perturbations. The greatest improvement is for Pong, where our framework yields robustness to perturbations of up to 25/255; in contrast, the best existing approach can only tolerate adversarial noise up to 5/255. 
Our code is available at: https://github.com/jlwu002/BCL.", "bibtex": "@InProceedings{pmlr-v162-wu22k,\n title = \t {Robust Deep Reinforcement Learning through Bootstrapped Opportunistic Curriculum},\n author = {Wu, Junlin and Vorobeychik, Yevgeniy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24177--24211},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22k/wu22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22k.html},\n abstract = \t {Despite considerable advances in deep reinforcement learning, it has been shown to be highly vulnerable to adversarial perturbations to state observations. Recent efforts that have attempted to improve adversarial robustness of reinforcement learning can nevertheless tolerate only very small perturbations, and remain fragile as perturbation size increases. We propose Bootstrapped Opportunistic Adversarial Curriculum Learning (BCL), a novel flexible adversarial curriculum learning framework for robust reinforcement learning. Our framework combines two ideas: conservatively bootstrapping each curriculum phase with highest quality solutions obtained from multiple runs of the previous phase, and opportunistically skipping forward in the curriculum. In our experiments we show that the proposed BCL framework enables dramatic improvements in robustness of learned policies to adversarial perturbations. The greatest improvement is for Pong, where our framework yields robustness to perturbations of up to 25/255; in contrast, the best existing approach can only tolerate adversarial noise up to 5/255. Our code is available at: https://github.com/jlwu002/BCL.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22k/wu22k.pdf", "supp": "", "pdf_size": 426664, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6530213985097280080&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science and Engineering, Washington University in St. Louis, St. Louis, MO, USA; Department of Computer Science and Engineering, Washington University in St. Louis, St. Louis, MO, USA", "aff_domain": "wustl.edu;wustl.edu", "email": "wustl.edu;wustl.edu", "github": "https://github.com/jlwu002/BCL", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wu22k.html", "aff_unique_index": "0;0", "aff_unique_norm": "Washington University in St. Louis", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://wustl.edu", "aff_unique_abbr": "WashU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "St. Louis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Robust Fine-Tuning of Deep Neural Networks with Hessian-based Generalization Guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17933", "id": "17933", "proceeding": "https://proceedings.mlr.press/v162/ju22a.html", "poster": "", "slides": "", "author_site": "Haotian Ju, Dongyue Li, Hongyang Zhang", "author": "Haotian Ju; Dongyue Li; Hongyang R Zhang", "abstract": "We consider transfer learning approaches that fine-tune a pretrained deep neural network on a target task. 
We investigate generalization properties of fine-tuning to understand the problem of overfitting, which often happens in practice. Previous works have shown that constraining the distance from the initialization of fine-tuning improves generalization. Using a PAC-Bayesian analysis, we observe that besides distance from initialization, Hessians affect generalization through the noise stability of deep neural networks against noise injections. Motivated by the observation, we develop Hessian distance-based generalization bounds for a wide range of fine-tuning methods. Next, we investigate the robustness of fine-tuning with noisy labels. We design an algorithm that incorporates consistent losses and distance-based regularization for fine-tuning. Additionally, we prove a generalization error bound of our algorithm under class conditional independent noise in the training dataset labels. We perform a detailed empirical study of our algorithm on various noisy environments and architectures. For example, on six image classification tasks whose training labels are generated with programmatic labeling, we show a 3.26% accuracy improvement over prior methods. Meanwhile, the Hessian distance measure of the fine-tuned network using our algorithm decreases by six times more than existing approaches.", "bibtex": "@InProceedings{pmlr-v162-ju22a,\n title = \t {Robust Fine-Tuning of Deep Neural Networks with Hessian-based Generalization Guarantees},\n author = {Ju, Haotian and Li, Dongyue and Zhang, Hongyang R},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10431--10461},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ju22a/ju22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ju22a.html},\n abstract = \t {We consider transfer learning approaches that fine-tune a pretrained deep neural network on a target task. We investigate generalization properties of fine-tuning to understand the problem of overfitting, which often happens in practice. Previous works have shown that constraining the distance from the initialization of fine-tuning improves generalization. Using a PAC-Bayesian analysis, we observe that besides distance from initialization, Hessians affect generalization through the noise stability of deep neural networks against noise injections. Motivated by the observation, we develop Hessian distance-based generalization bounds for a wide range of fine-tuning methods. Next, we investigate the robustness of fine-tuning with noisy labels. We design an algorithm that incorporates consistent losses and distance-based regularization for fine-tuning. Additionally, we prove a generalization error bound of our algorithm under class conditional independent noise in the training dataset labels. We perform a detailed empirical study of our algorithm on various noisy environments and architectures. For example, on six image classification tasks whose training labels are generated with programmatic labeling, we show a 3.26% accuracy improvement over prior methods. 
Meanwhile, the Hessian distance measure of the fine-tuned network using our algorithm decreases by six times more than existing approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/ju22a/ju22a.pdf", "supp": "", "pdf_size": 858087, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6709344473214339936&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Northeastern University; Northeastern University; Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;northeastern.edu", "email": "northeastern.edu;northeastern.edu;northeastern.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ju22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Group Synchronization via Quadratic Programming", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17255", "id": "17255", "proceeding": "https://proceedings.mlr.press/v162/shi22g.html", "poster": "/media/PosterPDFs/ICML%202022/ec1f764517b7ffb52057af6df18142b7.png?t=1657742890.141469", "slides": "", "author_site": "Yunpeng Shi, Cole Wyeth, Gilad Lerman", "author": "Yunpeng Shi; Cole M Wyeth; Gilad Lerman", "abstract": "We propose a novel quadratic programming formulation for estimating the corruption levels in group synchronization, and use these estimates to solve this problem. Our objective function exploits the cycle consistency of the group and we thus refer to our method as detection and estimation of structural consistency (DESC). This general framework can be extended to other algebraic and geometric structures. Our formulation has the following advantages: it can tolerate corruption as high as the information-theoretic bound, it does not require a good initialization for the estimates of group elements, it has a simple interpretation, and under some mild conditions the global minimum of our objective function exactly recovers the corruption levels. We demonstrate the competitive accuracy of our approach on both synthetic and real data experiments of rotation averaging.", "bibtex": "@InProceedings{pmlr-v162-shi22g,\n title = \t {Robust Group Synchronization via Quadratic Programming},\n author = {Shi, Yunpeng and Wyeth, Cole M and Lerman, Gilad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20095--20105},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22g/shi22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22g.html},\n abstract = \t {We propose a novel quadratic programming formulation for estimating the corruption levels in group synchronization, and use these estimates to solve this problem. Our objective function exploits the cycle consistency of the group and we thus refer to our method as detection and estimation of structural consistency (DESC). This general framework can be extended to other algebraic and geometric structures. 
Our formulation has the following advantages: it can tolerate corruption as high as the information-theoretic bound, it does not require a good initialization for the estimates of group elements, it has a simple interpretation, and under some mild conditions the global minimum of our objective function exactly recovers the corruption levels. We demonstrate the competitive accuracy of our approach on both synthetic and real data experiments of rotation averaging.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22g/shi22g.pdf", "supp": "", "pdf_size": 620396, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14329242327668843280&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Program in Applied and Computational Mathematics, Princeton University; School of Mathematics, University of Minnesota; School of Mathematics, University of Minnesota", "aff_domain": "princeton.edu;umn.edu;umn.edu", "email": "princeton.edu;umn.edu;umn.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/shi22g.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Princeton University;University of Minnesota", "aff_unique_dep": "Program in Applied and Computational Mathematics;School of Mathematics", "aff_unique_url": "https://www.princeton.edu;https://www.math.umn.edu", "aff_unique_abbr": "Princeton;UMN", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Minneapolis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Imitation Learning against Variations in Environment Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17563", "id": "17563", "proceeding": "https://proceedings.mlr.press/v162/chae22a.html", "poster": "/media/PosterPDFs/ICML%202022/7fbe9c04f99dbfbc738ae9079740a314.png?t=1657159379.7902982", "slides": "/media/icml-2022/Slides/17563_ISAShNJ.pdf", "author_site": "Jongseong Chae, Seungyul Han, Whiyoung Jung, MYUNG-SIK CHO, Sungho Choi, Youngchul Sung", "author": "Jongseong Chae; Seungyul Han; Whiyoung Jung; Myungsik Cho; Sungho Choi; Youngchul Sung", "abstract": "In this paper, we propose a robust imitation learning (IL) framework that improves the robustness of IL when environment dynamics are perturbed. The existing IL framework trained in a single environment can catastrophically fail with perturbations in environment dynamics because it does not capture the situation that underlying environment dynamics can be changed. Our framework effectively deals with environments with varying dynamics by imitating multiple experts in sampled environment dynamics to enhance the robustness in general variations in environment dynamics. In order to robustly imitate the multiple sample experts, we minimize the risk with respect to the Jensen-Shannon divergence between the agent\u2019s policy and each of the sample experts. 
Numerical results show that our algorithm significantly improves robustness against dynamics perturbations compared to conventional IL baselines.", "bibtex": "@InProceedings{pmlr-v162-chae22a,\n title = \t {Robust Imitation Learning against Variations in Environment Dynamics},\n author = {Chae, Jongseong and Han, Seungyul and Jung, Whiyoung and Cho, Myungsik and Choi, Sungho and Sung, Youngchul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2828--2852},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chae22a/chae22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chae22a.html},\n abstract = \t {In this paper, we propose a robust imitation learning (IL) framework that improves the robustness of IL when environment dynamics are perturbed. The existing IL framework trained in a single environment can catastrophically fail with perturbations in environment dynamics because it does not capture the situation that underlying environment dynamics can be changed. Our framework effectively deals with environments with varying dynamics by imitating multiple experts in sampled environment dynamics to enhance the robustness in general variations in environment dynamics. In order to robustly imitate the multiple sample experts, we minimize the risk with respect to the Jensen-Shannon divergence between the agent\u2019s policy and each of the sample experts. Numerical results show that our algorithm significantly improves robustness against dynamics perturbations compared to conventional IL baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/chae22a/chae22a.pdf", "supp": "", "pdf_size": 1870210, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16698148577673896615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Electrical Engineering, KAIST, Daejeon, South Korea; Artificial Intelligence Graduate School, UNIST, Ulsan, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea", "aff_domain": "unist.ac.kr; ; ; ; ; ", "email": "unist.ac.kr; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chae22a.html", "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "KAIST;Ulsan National Institute of Science and Technology", "aff_unique_dep": "School of Electrical Engineering;Artificial Intelligence Graduate School", "aff_unique_url": "https://www.kaist.ac.kr;https://www.unist.ac.kr", "aff_unique_abbr": "KAIST;UNIST", "aff_campus_unique_index": "0;1;0;0;0;0", "aff_campus_unique": "Daejeon;Ulsan", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Robust Kernel Density Estimation with Median-of-Means principle", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16259", "id": "16259", "proceeding": "https://proceedings.mlr.press/v162/humbert22a.html", "poster": "/media/PosterPDFs/ICML%202022/a6db4ed04f1621a119799fd3d7545d3d.png?t=1657552816.317489", "slides": "", "author_site": "Pierre Humbert, 
Batiste Le Bars, Ludovic Minvielle", "author": "Pierre Humbert; Batiste Le Bars; Ludovic Minvielle", "abstract": "In this paper, we introduce a robust non-parametric density estimator combining the popular Kernel Density Estimation method and the Median-of-Means principle (MoM-KDE). This estimator is shown to achieve robustness for a large class of anomalous data, potentially adversarial. In particular, while previous works only prove consistency results under very specific contamination models, this work provides finite-sample high-probability error-bounds without any prior knowledge on the outliers. To highlight the robustness of our method, we introduce an influence function adapted to the considered OUI framework. Finally, we show that MoM-KDE achieves competitive results when compared with other robust kernel estimators, while having significantly lower computational complexity.", "bibtex": "@InProceedings{pmlr-v162-humbert22a,\n title = \t {Robust Kernel Density Estimation with Median-of-Means principle},\n author = {Humbert, Pierre and Bars, Batiste Le and Minvielle, Ludovic},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9444--9465},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/humbert22a/humbert22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/humbert22a.html},\n abstract = \t {In this paper, we introduce a robust non-parametric density estimator combining the popular Kernel Density Estimation method and the Median-of-Means principle (MoM-KDE). This estimator is shown to achieve robustness for a large class of anomalous data, potentially adversarial. In particular, while previous works only prove consistency results under very specific contamination models, this work provides finite-sample high-probability error-bounds without any prior knowledge on the outliers. To highlight the robustness of our method, we introduce an influence function adapted to the considered OUI framework. 
Finally, we show that MoM-KDE achieves competitive results when compared with other robust kernel estimators, while having significantly lower computational complexity.}\n}", "pdf": "https://proceedings.mlr.press/v162/humbert22a/humbert22a.pdf", "supp": "", "pdf_size": 1554002, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14673811907284819215&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Universit\u00e9 Paris-Saclay, CNRS, Inria, Laboratoire de math\u00e9matiques d\u2019Orsay, 91405, Orsay, France; Universit\u00e9 Lille, CNRS, Inria, Centrale Lille, UMR 9189 - CRIStAL, F-59000 Lille; Universit\u00e9 Paris-Saclay, ENS Paris-Saclay, CNRS, Centre Borelli, F-91190, Gif-sur-Yvette, France", "aff_domain": "universite-paris-saclay.fr;inria.fr; ", "email": "universite-paris-saclay.fr;inria.fr; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/humbert22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 Paris-Saclay;Universit\u00e9 Lille", "aff_unique_dep": "Laboratoire de math\u00e9matiques d\u2019Orsay;", "aff_unique_url": "https://www.universite-paris-saclay.fr;https://www.univ-lille.fr", "aff_unique_abbr": "UPS;ULille", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Orsay;Lille;Gif-sur-Yvette", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Robust Meta-learning with Sampling Noise and Label Noise via Eigen-Reptile", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17029", "id": "17029", "proceeding": "https://proceedings.mlr.press/v162/chen22aa.html", "poster": "/media/PosterPDFs/ICML%202022/cc7e2b878868cbae992d1fb743995d8f.png?t=1657369871.3448713", "slides": "", "author_site": "Dong Chen, Lingfei Wu, Siliang Tang, Xiao Yun, Bo Long, Yueting Zhuang", "author": "Dong Chen; Lingfei Wu; Siliang Tang; Xiao Yun; Bo Long; Yueting Zhuang", "abstract": "Recent years have seen a surge of interest in meta-learning techniques for tackling the few-shot learning (FSL) problem. However, the meta-learner is prone to overfitting since there are only a few available samples, which can be identified as sampling noise on a clean dataset. Besides, when handling the data with noisy labels, the meta-learner could be extremely sensitive to label noise on a corrupted dataset. To address these two challenges, we present Eigen-Reptile (ER) that updates the meta-parameters with the main direction of historical task-specific parameters. Specifically, the main direction is computed in a fast way, where the scale of the calculated matrix is related to the number of gradient steps for the specific task instead of the number of parameters. Furthermore, to obtain a more accurate main direction for Eigen-Reptile in the presence of many noisy labels, we further propose Introspective Self-paced Learning (ISPL). We have theoretically and experimentally demonstrated the soundness and effectiveness of the proposed Eigen-Reptile and ISPL. Particularly, our experiments on different tasks show that the proposed method is able to outperform or achieve highly competitive performance compared with other gradient-based methods with or without noisy labels. 
The code and data for the proposed method are provided for research purposes https://github.com/Anfeather/Eigen-Reptile.", "bibtex": "@InProceedings{pmlr-v162-chen22aa,\n title = \t {Robust Meta-learning with Sampling Noise and Label Noise via Eigen-Reptile},\n author = {Chen, Dong and Wu, Lingfei and Tang, Siliang and Yun, Xiao and Long, Bo and Zhuang, Yueting},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3662--3678},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22aa/chen22aa.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22aa.html},\n abstract = \t {Recent years have seen a surge of interest in meta-learning techniques for tackling the few-shot learning (FSL) problem. However, the meta-learner is prone to overfitting since there are only a few available samples, which can be identified as sampling noise on a clean dataset. Besides, when handling the data with noisy labels, the meta-learner could be extremely sensitive to label noise on a corrupted dataset. To address these two challenges, we present Eigen-Reptile (ER) that updates the meta-parameters with the main direction of historical task-specific parameters. Specifically, the main direction is computed in a fast way, where the scale of the calculated matrix is related to the number of gradient steps for the specific task instead of the number of parameters. Furthermore, to obtain a more accurate main direction for Eigen-Reptile in the presence of many noisy labels, we further propose Introspective Self-paced Learning (ISPL). We have theoretically and experimentally demonstrated the soundness and effectiveness of the proposed Eigen-Reptile and ISPL. Particularly, our experiments on different tasks show that the proposed method is able to outperform or achieve highly competitive performance compared with other gradient-based methods with or without noisy labels. 
The code and data for the proposed method are provided for research purposes https://github.com/Anfeather/Eigen-Reptile.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22aa/chen22aa.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22aa-supp.zip", "pdf_size": 2181042, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8530355739289210050&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "College of Computer Science and Technology, Zhejiang University, Hangzhou, China; JD.COM Silicon Valley Research Center, 675 E Middlefield Rd, Mountain View, CA 94043 USA; College of Computer Science and Technology, Zhejiang University, Hangzhou, China; JD.COM Silicon Valley Research Center, 675 E Middlefield Rd, Mountain View, CA 94043 USA; JD.COM Silicon Valley Research Center, 675 E Middlefield Rd, Mountain View, CA 94043 USA; College of Computer Science and Technology, Zhejiang University, Hangzhou, China", "aff_domain": "zju.edu.cn;jd.com;zju.edu.cn;jd.com;jd.com;zju.edu.cn", "email": "zju.edu.cn;jd.com;zju.edu.cn;jd.com;jd.com;zju.edu.cn", "github": "https://github.com/Anfeather/Eigen-Reptile", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22aa.html", "aff_unique_index": "0;1;0;1;1;0", "aff_unique_norm": "Zhejiang University;JD.com", "aff_unique_dep": "College of Computer Science and Technology;Research Center", "aff_unique_url": "http://www.zju.edu.cn;https://www.jd.com", "aff_unique_abbr": "ZJU;JD", "aff_campus_unique_index": "0;1;0;1;1;0", "aff_campus_unique": "Hangzhou;Silicon Valley", "aff_country_unique_index": "0;1;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Robust Models Are More Interpretable Because Attributions Look Normal", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16003", "id": "16003", "proceeding": "https://proceedings.mlr.press/v162/wang22e.html", "poster": "/media/PosterPDFs/ICML%202022/d139db6a236200b21cc7f752979132d0.png?t=1657759326.5748587", "slides": "", "author_site": "Zifan Wang, Matt Fredrikson, Anupam Datta", "author": "Zifan Wang; Matt Fredrikson; Anupam Datta", "abstract": "Recent work has found that adversarially-robust deep networks used for image classification are more interpretable: their feature attributions tend to be sharper, and are more concentrated on the objects associated with the image\u2019s ground-truth class. We show that smooth decision boundaries play an important role in this enhanced interpretability, as the model\u2019s input gradients around data points will more closely align with boundaries\u2019 normal vectors when they are smooth. Thus, because robust models have smoother boundaries, the results of gradient-based attribution methods, like Integrated Gradients and DeepLift, will capture more accurate information about nearby decision boundaries. This understanding of robust interpretability leads to our second contribution: boundary attributions, which aggregate information about the normal vectors of local decision boundaries to explain a classification outcome. 
We show that by leveraging the key factors underpinning robust interpretability, boundary attributions produce sharper, more concentrated visual explanations{\u2014}even on non-robust models.", "bibtex": "@InProceedings{pmlr-v162-wang22e,\n title = \t {Robust Models Are More Interpretable Because Attributions Look Normal},\n author = {Wang, Zifan and Fredrikson, Matt and Datta, Anupam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22625--22651},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22e/wang22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22e.html},\n abstract = \t {Recent work has found that adversarially-robust deep networks used for image classification are more interpretable: their feature attributions tend to be sharper, and are more concentrated on the objects associated with the image\u2019s ground-truth class. We show that smooth decision boundaries play an important role in this enhanced interpretability, as the model\u2019s input gradients around data points will more closely align with boundaries\u2019 normal vectors when they are smooth. Thus, because robust models have smoother boundaries, the results of gradient-based attribution methods, like Integrated Gradients and DeepLift, will capture more accurate information about nearby decision boundaries. This understanding of robust interpretability leads to our second contribution: boundary attributions, which aggregate information about the normal vectors of local decision boundaries to explain a classification outcome. We show that by leveraging the key factors underpinning robust interpretability, boundary attributions produce sharper, more concentrated visual explanations{\u2014}even on non-robust models.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22e/wang22e.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22e-supp.zip", "pdf_size": 24675869, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14430069598728045155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "cmu.edu; ; ", "email": "cmu.edu; ; ", "github": "https://github.com/zifanw/boundary", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Multi-Objective Bayesian Optimization Under Input Noise", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16837", "id": "16837", "proceeding": "https://proceedings.mlr.press/v162/daulton22a.html", "poster": "/media/PosterPDFs/ICML%202022/94f6d7e04a4d452035300f18b984988c.png?t=1658345037.8764822", "slides": "", "author_site": "Samuel Daulton, Sait Cakmak, Maximilian Balandat, Michael A Osborne, Enlu Zhou, Eytan Bakshy", "author": "Samuel Daulton; Sait Cakmak; Maximilian Balandat; Michael A. 
Osborne; Enlu Zhou; Eytan Bakshy", "abstract": "Bayesian optimization (BO) is a sample-efficient approach for tuning design parameters to optimize expensive-to-evaluate, black-box performance metrics. In many manufacturing processes, the design parameters are subject to random input noise, resulting in a product that is often less performant than expected. Although BO methods have been proposed for optimizing a single objective under input noise, no existing method addresses the practical scenario where there are multiple objectives that are sensitive to input perturbations. In this work, we propose the first multi-objective BO method that is robust to input noise. We formalize our goal as optimizing the multivariate value-at-risk (MVaR), a risk measure of the uncertain objectives. Since directly optimizing MVaR is computationally infeasible in many settings, we propose a scalable, theoretically-grounded approach for optimizing MVaR using random scalarizations. Empirically, we find that our approach significantly outperforms alternative methods and efficiently identifies optimal robust designs that will satisfy specifications across multiple metrics with high probability.", "bibtex": "@InProceedings{pmlr-v162-daulton22a,\n title = \t {Robust Multi-Objective {B}ayesian Optimization Under Input Noise},\n author = {Daulton, Samuel and Cakmak, Sait and Balandat, Maximilian and Osborne, Michael A. and Zhou, Enlu and Bakshy, Eytan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4831--4866},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/daulton22a/daulton22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/daulton22a.html},\n abstract = \t {Bayesian optimization (BO) is a sample-efficient approach for tuning design parameters to optimize expensive-to-evaluate, black-box performance metrics. In many manufacturing processes, the design parameters are subject to random input noise, resulting in a product that is often less performant than expected. Although BO methods have been proposed for optimizing a single objective under input noise, no existing method addresses the practical scenario where there are multiple objectives that are sensitive to input perturbations. In this work, we propose the first multi-objective BO method that is robust to input noise. We formalize our goal as optimizing the multivariate value-at-risk (MVaR), a risk measure of the uncertain objectives. Since directly optimizing MVaR is computationally infeasible in many settings, we propose a scalable, theoretically-grounded approach for optimizing MVaR using random scalarizations. 
Empirically, we find that our approach significantly outperforms alternative methods and efficiently identifies optimal robust designs that will satisfy specifications across multiple metrics with high probability.}\n}", "pdf": "https://proceedings.mlr.press/v162/daulton22a/daulton22a.pdf", "supp": "", "pdf_size": 1282812, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14538783621300673718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Meta; Meta + University of Oxford; Meta; University of Oxford; Georgia Institute of Technology; Meta", "aff_domain": "fb.com;fb.com; ;cs.ox.ac.uk; gatech.edu; ", "email": "fb.com;fb.com; ;cs.ox.ac.uk; gatech.edu; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/daulton22a.html", "aff_unique_index": "0;0+1;0;1;2;0", "aff_unique_norm": "Meta;University of Oxford;Georgia Institute of Technology", "aff_unique_dep": "Meta Platforms, Inc.;;", "aff_unique_url": "https://meta.com;https://www.ox.ac.uk;https://www.gatech.edu", "aff_unique_abbr": "Meta;Oxford;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Robust Policy Learning over Multiple Uncertainty Sets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17151", "id": "17151", "proceeding": "https://proceedings.mlr.press/v162/xie22c.html", "poster": "/media/PosterPDFs/ICML%202022/9087cd8bfa9c1968b20d8f6d0b81cbbb_0cKwqHa.png?t=1657487674.2959383", "slides": "", "author_site": "Annie Xie, Shagun Sodhani, Chelsea Finn, Joelle Pineau, Amy Zhang", "author": "Annie Xie; Shagun Sodhani; Chelsea Finn; Joelle Pineau; Amy Zhang", "abstract": "Reinforcement learning (RL) agents need to be robust to variations in safety-critical environments. While system identification methods provide a way to infer the variation from online experience, they can fail in settings where fast identification is not possible. Another dominant approach is robust RL which produces a policy that can handle worst-case scenarios, but these methods are generally designed to achieve robustness to a single uncertainty set that must be specified at train time. Towards a more general solution, we formulate the multi-set robustness problem to learn a policy robust to different perturbation sets. We then design an algorithm that enjoys the benefits of both system identification and robust RL: it reduces uncertainty where possible given a few interactions, but can still act robustly with respect to the remaining uncertainty. 
On a diverse set of control tasks, our approach demonstrates improved worst-case performance on new environments compared to prior methods based on system identification and on robust RL alone.", "bibtex": "@InProceedings{pmlr-v162-xie22c,\n title = \t {Robust Policy Learning over Multiple Uncertainty Sets},\n author = {Xie, Annie and Sodhani, Shagun and Finn, Chelsea and Pineau, Joelle and Zhang, Amy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24414--24429},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xie22c/xie22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/xie22c.html},\n abstract = \t {Reinforcement learning (RL) agents need to be robust to variations in safety-critical environments. While system identification methods provide a way to infer the variation from online experience, they can fail in settings where fast identification is not possible. Another dominant approach is robust RL which produces a policy that can handle worst-case scenarios, but these methods are generally designed to achieve robustness to a single uncertainty set that must be specified at train time. Towards a more general solution, we formulate the multi-set robustness problem to learn a policy robust to different perturbation sets. We then design an algorithm that enjoys the benefits of both system identification and robust RL: it reduces uncertainty where possible given a few interactions, but can still act robustly with respect to the remaining uncertainty. 
On a diverse set of control tasks, our approach demonstrates improved worst-case performance on new environments compared to prior methods based on system identification and on robust RL alone.}\n}", "pdf": "https://proceedings.mlr.press/v162/xie22c/xie22c.pdf", "supp": "", "pdf_size": 1956108, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=156603264986615112&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Stanford University; Facebook AI Research; Stanford University; Facebook AI Research; Facebook AI Research", "aff_domain": "stanford.edu; ; ; ; ", "email": "stanford.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/xie22c.html", "aff_unique_index": "0;1;0;1;1", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust SDE-Based Variational Formulations for Solving Linear PDEs via Deep Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16351", "id": "16351", "proceeding": "https://proceedings.mlr.press/v162/richter22a.html", "poster": "/media/PosterPDFs/ICML%202022/30ba105754346aaf47509089d2287f2a.png?t=1657285968.2076952", "slides": "", "author_site": "Lorenz Richter, Julius Berner", "author": "Lorenz Richter; Julius Berner", "abstract": "The combination of Monte Carlo methods and deep learning has recently led to efficient algorithms for solving partial differential equations (PDEs) in high dimensions. Related learning problems are often stated as variational formulations based on associated stochastic differential equations (SDEs), which allow the minimization of corresponding losses using gradient-based optimization methods. In respective numerical implementations it is therefore crucial to rely on adequate gradient estimators that exhibit low variance in order to reach convergence accurately and swiftly. In this article, we rigorously investigate corresponding numerical aspects that appear in the context of linear Kolmogorov PDEs. In particular, we systematically compare existing deep learning approaches and provide theoretical explanations for their performances. Subsequently, we suggest novel methods that can be shown to be more robust both theoretically and numerically, leading to substantial performance improvements.", "bibtex": "@InProceedings{pmlr-v162-richter22a,\n title = \t {Robust {SDE}-Based Variational Formulations for Solving Linear {PDE}s via Deep Learning},\n author = {Richter, Lorenz and Berner, Julius},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18649--18666},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/richter22a/richter22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/richter22a.html},\n abstract = \t {The combination of Monte Carlo methods and deep learning has recently led to efficient algorithms for solving partial differential equations (PDEs) in high dimensions. 
Related learning problems are often stated as variational formulations based on associated stochastic differential equations (SDEs), which allow the minimization of corresponding losses using gradient-based optimization methods. In respective numerical implementations it is therefore crucial to rely on adequate gradient estimators that exhibit low variance in order to reach convergence accurately and swiftly. In this article, we rigorously investigate corresponding numerical aspects that appear in the context of linear Kolmogorov PDEs. In particular, we systematically compare existing deep learning approaches and provide theoretical explanations for their performances. Subsequently, we suggest novel methods that can be shown to be more robust both theoretically and numerically, leading to substantial performance improvements.}\n}", "pdf": "https://proceedings.mlr.press/v162/richter22a/richter22a.pdf", "supp": "", "pdf_size": 705006, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5839668907631655505&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "dida Datenschmiede GmbH, Germany+Zuse Institute Berlin, Germany+Institute of Mathematics, Freie Universit\u00e4t Berlin, Germany; Faculty of Mathematics, University of Vienna, Austria", "aff_domain": "fu-berlin.de;univie.ac.at", "email": "fu-berlin.de;univie.ac.at", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/richter22a.html", "aff_unique_index": "0+1+2;3", "aff_unique_norm": "dida Datenschmiede GmbH;Zuse Institute Berlin;Freie Universit\u00e4t Berlin;University of Vienna", "aff_unique_dep": ";;Institute of Mathematics;Faculty of Mathematics", "aff_unique_url": ";https://www.zib.de;https://www.fu-berlin.de;https://univie.ac.at", "aff_unique_abbr": ";ZIB;FU Berlin;Uni Vienna", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berlin", "aff_country_unique_index": "0+0+0;1", "aff_country_unique": "Germany;Austria" }, { "title": "Robust Task Representations for Offline Meta-Reinforcement Learning via Contrastive Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16239", "id": "16239", "proceeding": "https://proceedings.mlr.press/v162/yuan22a.html", "poster": "/media/PosterPDFs/ICML%202022/2c79b73d2716e9470ec621310f08e6fe.png?t=1657273213.9949708", "slides": "", "author_site": "Haoqi Yuan, Zongqing Lu", "author": "Haoqi Yuan; Zongqing Lu", "abstract": "We study offline meta-reinforcement learning, a practical reinforcement learning paradigm that learns from offline data to adapt to new tasks. The distribution of offline data is determined jointly by the behavior policy and the task. Existing offline meta-reinforcement learning algorithms cannot distinguish these factors, making task representations unstable to the change of behavior policies. To address this problem, we propose a contrastive learning framework for task representations that are robust to the distribution mismatch of behavior policies in training and test. We design a bi-level encoder structure, use mutual information maximization to formalize task representation learning, derive a contrastive learning objective, and introduce several approaches to approximate the true distribution of negative pairs. 
Experiments on a variety of offline meta-reinforcement learning benchmarks demonstrate the advantages of our method over prior methods, especially on the generalization to out-of-distribution behavior policies.", "bibtex": "@InProceedings{pmlr-v162-yuan22a,\n title = \t {Robust Task Representations for Offline Meta-Reinforcement Learning via Contrastive Learning},\n author = {Yuan, Haoqi and Lu, Zongqing},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25747--25759},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yuan22a/yuan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yuan22a.html},\n abstract = \t {We study offline meta-reinforcement learning, a practical reinforcement learning paradigm that learns from offline data to adapt to new tasks. The distribution of offline data is determined jointly by the behavior policy and the task. Existing offline meta-reinforcement learning algorithms cannot distinguish these factors, making task representations unstable to the change of behavior policies. To address this problem, we propose a contrastive learning framework for task representations that are robust to the distribution mismatch of behavior policies in training and test. We design a bi-level encoder structure, use mutual information maximization to formalize task representation learning, derive a contrastive learning objective, and introduce several approaches to approximate the true distribution of negative pairs. 
Experiments on a variety of offline meta-reinforcement learning benchmarks demonstrate the advantages of our method over prior methods, especially on the generalization to out-of-distribution behavior policies.}\n}", "pdf": "https://proceedings.mlr.press/v162/yuan22a/yuan22a.pdf", "supp": "", "pdf_size": 772985, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5539110127380539643&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Computer Science, Peking University; School of Computer Science, Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn", "email": "pku.edu.cn;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yuan22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Peking University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Robust Training of Neural Networks Using Scale Invariant Architectures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18249", "id": "18249", "proceeding": "https://proceedings.mlr.press/v162/li22b.html", "poster": "/media/PosterPDFs/ICML%202022/878a0658e652765c4979dba411787e43_RgFny5i.png?t=1658355731.206429", "slides": "", "author_site": "Zhiyuan Li, Srinadh Bhojanapalli, Manzil Zaheer, Sashank Jakkam Reddi, Sanjiv Kumar", "author": "Zhiyuan Li; Srinadh Bhojanapalli; Manzil Zaheer; Sashank Reddi; Sanjiv Kumar", "abstract": "In contrast to SGD, adaptive gradient methods like Adam allow robust training of modern deep networks, especially large language models. However, the use of adaptivity not only comes at the cost of extra memory but also raises the fundamental question: can non-adaptive methods like SGD enjoy similar benefits? In this paper, we provide an affirmative answer to this question by proposing to achieve both robust and memory-efficient training via the following general recipe: (1) modify the architecture and make it scale invariant, (2) train with SGD and weight decay, and optionally (3) clip the global gradient norm proportional to weight norm multiplied by $\\sqrt{\\frac{2\\lambda}{\\eta}}$, where $\\eta$ is learning rate and $\\lambda$ is weight decay. We show that this general approach is robust to rescaling of parameter and loss by proving that its convergence only depends logarithmically on the scale of initialization and loss, whereas the standard SGD might not even converge for many initializations. 
Following our recipe, we design a scale invariant version of BERT, called SIBERT, which when trained simply by vanilla SGD achieves performance comparable to BERT trained by adaptive methods like Adam on downstream tasks.", "bibtex": "@InProceedings{pmlr-v162-li22b,\n title = \t {Robust Training of Neural Networks Using Scale Invariant Architectures},\n author = {Li, Zhiyuan and Bhojanapalli, Srinadh and Zaheer, Manzil and Reddi, Sashank and Kumar, Sanjiv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12656--12684},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22b/li22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22b.html},\n abstract = \t {In contrast to SGD, adaptive gradient methods like Adam allow robust training of modern deep networks, especially large language models. However, the use of adaptivity not only comes at the cost of extra memory but also raises the fundamental question: can non-adaptive methods like SGD enjoy similar benefits? In this paper, we provide an affirmative answer to this question by proposing to achieve both robust and memory-efficient training via the following general recipe: (1) modify the architecture and make it scale invariant, (2) train with SGD and weight decay, and optionally (3) clip the global gradient norm proportional to weight norm multiplied by $\\sqrt{\\frac{2\\lambda}{\\eta}}$, where $\\eta$ is learning rate and $\\lambda$ is weight decay. We show that this general approach is robust to rescaling of parameter and loss by proving that its convergence only depends logarithmically on the scale of initialization and loss, whereas the standard SGD might not even converge for many initializations. 
Following our recipe, we design a scale invariant version of BERT, called SIBERT, which when trained simply by vanilla SGD achieves performance comparable to BERT trained by adaptive methods like Adam on downstream tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22b/li22b.pdf", "supp": "", "pdf_size": 929255, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13959066646989222014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Princeton University + Google Research New York; Google Research New York; Google DeepMind New York; Google Research New York; Google Research New York", "aff_domain": "cs.princeton.edu; ; ; ; ", "email": "cs.princeton.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/li22b.html", "aff_unique_index": "0+1;1;1;1;1", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.princeton.edu;https://research.google", "aff_unique_abbr": "Princeton;Google Research", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Training under Label Noise by Over-parameterization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17127", "id": "17127", "proceeding": "https://proceedings.mlr.press/v162/liu22w.html", "poster": "/media/PosterPDFs/ICML%202022/10c66082c124f8afe3df4886f5e516e0.png?t=1657323576.70746", "slides": "", "author_site": "Sheng Liu, Zhihui Zhu, Qing Qu, Chong You", "author": "Sheng Liu; Zhihui Zhu; Qing Qu; Chong You", "abstract": "Recently, over-parameterized deep networks, with increasingly more network parameters than training samples, have dominated the performances of modern machine learning. However, when the training data is corrupted, it has been well-known that over-parameterized networks tend to overfit and do not generalize. In this work, we propose a principled approach for robust training of over-parameterized deep networks in classification tasks where a proportion of training labels are corrupted. The main idea is yet very simple: label noise is sparse and incoherent with the network learned from clean data, so we model the noise and learn to separate it from the data. Specifically, we model the label noise via another sparse over-parameterization term, and exploit implicit algorithmic regularizations to recover and separate the underlying corruptions. Remarkably, when trained using such a simple method in practice, we demonstrate state-of-the-art test accuracy against label noise on a variety of real datasets. Furthermore, our experimental results are corroborated by theory on simplified linear models, showing that exact separation between sparse noise and low-rank data can be achieved under incoherent conditions. The work opens many interesting directions for improving over-parameterized models by using sparse over-parameterization and implicit regularization. 
Code is available at https://github.com/shengliu66/SOP.", "bibtex": "@InProceedings{pmlr-v162-liu22w,\n title = \t {Robust Training under Label Noise by Over-parameterization},\n author = {Liu, Sheng and Zhu, Zhihui and Qu, Qing and You, Chong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14153--14172},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22w/liu22w.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22w.html},\n abstract = \t {Recently, over-parameterized deep networks, with increasingly more network parameters than training samples, have dominated the performances of modern machine learning. However, when the training data is corrupted, it has been well-known that over-parameterized networks tend to overfit and do not generalize. In this work, we propose a principled approach for robust training of over-parameterized deep networks in classification tasks where a proportion of training labels are corrupted. The main idea is yet very simple: label noise is sparse and incoherent with the network learned from clean data, so we model the noise and learn to separate it from the data. Specifically, we model the label noise via another sparse over-parameterization term, and exploit implicit algorithmic regularizations to recover and separate the underlying corruptions. Remarkably, when trained using such a simple method in practice, we demonstrate state-of-the-art test accuracy against label noise on a variety of real datasets. Furthermore, our experimental results are corroborated by theory on simplified linear models, showing that exact separation between sparse noise and low-rank data can be achieved under incoherent conditions. The work opens many interesting directions for improving over-parameterized models by using sparse over-parameterization and implicit regularization. 
Code is available at https://github.com/shengliu66/SOP.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22w/liu22w.pdf", "supp": "", "pdf_size": 765696, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7351288537652812990&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Center for Data Science, New York University; Electrical and Computer Engineering, University of Denver; Department of EECS, University of Michigan; Google Research, New York City", "aff_domain": "nyu.edu;du.edu;umich.edu;google.com", "email": "nyu.edu;du.edu;umich.edu;google.com", "github": "https://github.com/shengliu66/SOP", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22w.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "New York University;University of Denver;University of Michigan;Google", "aff_unique_dep": "Center for Data Science;Electrical and Computer Engineering;Department of Electrical Engineering and Computer Science;Google Research", "aff_unique_url": "https://www.nyu.edu;https://www.du.edu;https://www.umich.edu;https://research.google", "aff_unique_abbr": "NYU;DU;UM;Google Research", "aff_campus_unique_index": "0;2;3", "aff_campus_unique": "New York;;Ann Arbor;New York City", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust alignment of cross-session recordings of neural population activity by behaviour via unsupervised domain adaptation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16481", "id": "16481", "proceeding": "https://proceedings.mlr.press/v162/jude22a.html", "poster": "/media/PosterPDFs/ICML%202022/273448411df1962cba1db6c05b3213c9.png?t=1657198512.843701", "slides": "", "author_site": "Justin Jude, Matthew G. Perich, Lee Miller, Matthias Hennig", "author": "Justin Jude; Matthew Perich; Lee Miller; Matthias Hennig", "abstract": "Neural population activity relating to behaviour is assumed to be inherently low-dimensional despite the observed high dimensionality of data recorded using multi-electrode arrays. Therefore, predicting behaviour from neural population recordings has been shown to be most effective when using latent variable models. Over time however, the activity of single neurons can drift, and different neurons will be recorded due to movement of implanted neural probes. This means that a decoder trained to predict behaviour on one day performs worse when tested on a different day. On the other hand, evidence suggests that the latent dynamics underlying behaviour may be stable even over months and years. Based on this idea, we introduce a model capable of inferring behaviourally relevant latent dynamics from previously unseen data recorded from the same animal, without any need for decoder recalibration. We show that unsupervised domain adaptation combined with a sequential variational autoencoder, trained on several sessions, can achieve good generalisation to unseen data and correctly predict behaviour where conventional methods fail. 
Our results further support the hypothesis that behaviour-related neural dynamics are low-dimensional and stable over time, and will enable more effective and flexible use of brain computer interface technologies.", "bibtex": "@InProceedings{pmlr-v162-jude22a,\n title = \t {Robust alignment of cross-session recordings of neural population activity by behaviour via unsupervised domain adaptation},\n author = {Jude, Justin and Perich, Matthew and Miller, Lee and Hennig, Matthias},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10462--10475},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jude22a/jude22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jude22a.html},\n abstract = \t {Neural population activity relating to behaviour is assumed to be inherently low-dimensional despite the observed high dimensionality of data recorded using multi-electrode arrays. Therefore, predicting behaviour from neural population recordings has been shown to be most effective when using latent variable models. Over time however, the activity of single neurons can drift, and different neurons will be recorded due to movement of implanted neural probes. This means that a decoder trained to predict behaviour on one day performs worse when tested on a different day. On the other hand, evidence suggests that the latent dynamics underlying behaviour may be stable even over months and years. Based on this idea, we introduce a model capable of inferring behaviourally relevant latent dynamics from previously unseen data recorded from the same animal, without any need for decoder recalibration. We show that unsupervised domain adaptation combined with a sequential variational autoencoder, trained on several sessions, can achieve good generalisation to unseen data and correctly predict behaviour where conventional methods fail. 
Our results further support the hypothesis that behaviour-related neural dynamics are low-dimensional and stable over time, and will enable more effective and flexible use of brain computer interface technologies.}\n}", "pdf": "https://proceedings.mlr.press/v162/jude22a/jude22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jude22a-supp.zip", "pdf_size": 12942342, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7706646666865733856&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Informatics, University of Edinburgh, Edinburgh, Scotland, EH8 9AB; Universit\u00e9 de Montr\u00e9al and Mila, Montr\u00e9al, QC, Canada H3C 3J7; Feinberg School of Medicine, Northwestern, Chicago, IL 60611; School of Informatics, University of Edinburgh, Edinburgh, Scotland, EH8 9AB", "aff_domain": "me.com; ; ; ", "email": "me.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jude22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Edinburgh;Universit\u00e9 de Montr\u00e9al;Northwestern University", "aff_unique_dep": "School of Informatics;;Feinberg School of Medicine", "aff_unique_url": "https://www.ed.ac.uk;https://www.umontreal.ca;https://www.northwestern.edu", "aff_unique_abbr": "Edinburgh;UdeM;NU", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Edinburgh;Montr\u00e9al;Chicago", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United Kingdom;Canada;United States" }, { "title": "Robustness Implies Generalization via Data-Dependent Generalization Bounds", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16121", "id": "16121", "proceeding": "https://proceedings.mlr.press/v162/kawaguchi22a.html", "poster": "", "slides": "", "author_site": "Kenji Kawaguchi, Zhun Deng, Kyle Luh, Jiaoyang Huang", "author": "Kenji Kawaguchi; Zhun Deng; Kyle Luh; Jiaoyang Huang", "abstract": "This paper proves that robustness implies generalization via data-dependent generalization bounds. As a result, robustness and generalization are shown to be connected closely in a data-dependent manner. Our bounds improve previous bounds in two directions, to solve an open problem that has seen little development since 2010. The first is to reduce the dependence on the covering number. The second is to remove the dependence on the hypothesis space. We present several examples, including ones for lasso and deep learning, in which our bounds are provably preferable. The experiments on real-world data and theoretical models demonstrate near-exponential improvements in various situations. To achieve these improvements, we do not require additional assumptions on the unknown distribution; instead, we only incorporate an observable and computable property of the training samples. 
A key technical innovation is an improved concentration bound for multinomial random variables that is of independent interest beyond robustness and generalization.", "bibtex": "@InProceedings{pmlr-v162-kawaguchi22a,\n title = \t {Robustness Implies Generalization via Data-Dependent Generalization Bounds},\n author = {Kawaguchi, Kenji and Deng, Zhun and Luh, Kyle and Huang, Jiaoyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10866--10894},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kawaguchi22a/kawaguchi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kawaguchi22a.html},\n abstract = \t {This paper proves that robustness implies generalization via data-dependent generalization bounds. As a result, robustness and generalization are shown to be connected closely in a data-dependent manner. Our bounds improve previous bounds in two directions, to solve an open problem that has seen little development since 2010. The first is to reduce the dependence on the covering number. The second is to remove the dependence on the hypothesis space. We present several examples, including ones for lasso and deep learning, in which our bounds are provably preferable. The experiments on real-world data and theoretical models demonstrate near-exponential improvements in various situations. To achieve these improvements, we do not require additional assumptions on the unknown distribution; instead, we only incorporate an observable and computable property of the training samples. 
A key technical innovation is an improved concentration bound for multinomial random variables that is of independent interest beyond robustness and generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/kawaguchi22a/kawaguchi22a.pdf", "supp": "", "pdf_size": 923151, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=289102848152009419&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "National University of Singapore; Harvard University; University of Colorado Boulder; New York University", "aff_domain": "nus.edu.sg; ; ; ", "email": "nus.edu.sg; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kawaguchi22a.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "National University of Singapore;Harvard University;University of Colorado;New York University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.harvard.edu;https://www.colorado.edu;https://www.nyu.edu", "aff_unique_abbr": "NUS;Harvard;CU;NYU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Boulder", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Singapore;United States" }, { "title": "Robustness Verification for Contrastive Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16761", "id": "16761", "proceeding": "https://proceedings.mlr.press/v162/wang22q.html", "poster": "/media/PosterPDFs/ICML%202022/3f4366aeb9c157cf9a30c90693eafc55_eZ0SMOW.png?t=1658044891.9146721", "slides": "/media/icml-2022/Slides/16761.pdf", "author_site": "Zekai Wang, Weiwei Liu", "author": "Zekai Wang; Weiwei Liu", "abstract": "Contrastive adversarial training has successfully improved the robustness of contrastive learning (CL). However, the robustness metric used in these methods is linked to attack algorithms, image labels and downstream tasks, all of which may affect the consistency and reliability of robustness metric for CL. To address these problems, this paper proposes a novel Robustness Verification framework for Contrastive Learning (RVCL). Furthermore, we use extreme value theory to reveal the relationship between the robust radius of the CL encoder and that of the supervised downstream task. Extensive experimental results on various benchmark models and datasets verify our theoretical findings, and further demonstrate that our proposed RVCL is able to evaluate the robustness of both models and images. Our code is available at https://github.com/wzekai99/RVCL.", "bibtex": "@InProceedings{pmlr-v162-wang22q,\n title = \t {Robustness Verification for Contrastive Learning},\n author = {Wang, Zekai and Liu, Weiwei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22865--22883},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22q/wang22q.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22q.html},\n abstract = \t {Contrastive adversarial training has successfully improved the robustness of contrastive learning (CL). However, the robustness metric used in these methods is linked to attack algorithms, image labels and downstream tasks, all of which may affect the consistency and reliability of robustness metric for CL. 
To address these problems, this paper proposes a novel Robustness Verification framework for Contrastive Learning (RVCL). Furthermore, we use extreme value theory to reveal the relationship between the robust radius of the CL encoder and that of the supervised downstream task. Extensive experimental results on various benchmark models and datasets verify our theoretical findings, and further demonstrate that our proposed RVCL is able to evaluate the robustness of both models and images. Our code is available at https://github.com/wzekai99/RVCL.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22q/wang22q.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22q-supp.zip", "pdf_size": 874490, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17479701569999738890&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science, Wuhan University, China; School of Computer Science, Wuhan University, China", "aff_domain": "whu.edu.cn;gmail.com", "email": "whu.edu.cn;gmail.com", "github": "https://github.com/wzekai99/RVCL", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22q.html", "aff_unique_index": "0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "http://www.whu.edu.cn", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Wuhan", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Robustness and Accuracy Could Be Reconcilable by (Proper) Definition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17019", "id": "17019", "proceeding": "https://proceedings.mlr.press/v162/pang22a.html", "poster": "/media/PosterPDFs/ICML%202022/1579779b98ce9edb98dd85606f2c119d.png?t=1656599985.38685", "slides": "/media/icml-2022/Slides/17019.pdf", "author_site": "Tianyu Pang, Min Lin, Xiao Yang, Jun Zhu, Shuicheng Yan", "author": "Tianyu Pang; Min Lin; Xiao Yang; Jun Zhu; Shuicheng Yan", "abstract": "The trade-off between robustness and accuracy has been widely studied in the adversarial literature. Although still controversial, the prevailing view is that this trade-off is inherent, either empirically or theoretically. Thus, we dig for the origin of this trade-off in adversarial training and find that it may stem from the improperly defined robust error, which imposes an inductive bias of local invariance \u2014 an overcorrection towards smoothness. Given this, we advocate employing local equivariance to describe the ideal behavior of a robust model, leading to a self-consistent robust error named SCORE. By definition, SCORE facilitates the reconciliation between robustness and accuracy, while still handling the worst-case uncertainty via robust optimization. By simply substituting KL divergence with variants of distance metrics, SCORE can be efficiently minimized. Empirically, our models achieve top-rank performance on RobustBench under AutoAttack. 
Besides, SCORE provides instructive insights for explaining the overfitting phenomenon and semantic input gradients observed on robust models.", "bibtex": "@InProceedings{pmlr-v162-pang22a,\n title = \t {Robustness and Accuracy Could Be Reconcilable by ({P}roper) Definition},\n author = {Pang, Tianyu and Lin, Min and Yang, Xiao and Zhu, Jun and Yan, Shuicheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17258--17277},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/pang22a/pang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/pang22a.html},\n abstract = \t {The trade-off between robustness and accuracy has been widely studied in the adversarial literature. Although still controversial, the prevailing view is that this trade-off is inherent, either empirically or theoretically. Thus, we dig for the origin of this trade-off in adversarial training and find that it may stem from the improperly defined robust error, which imposes an inductive bias of local invariance \u2014 an overcorrection towards smoothness. Given this, we advocate employing local equivariance to describe the ideal behavior of a robust model, leading to a self-consistent robust error named SCORE. By definition, SCORE facilitates the reconciliation between robustness and accuracy, while still handling the worst-case uncertainty via robust optimization. By simply substituting KL divergence with variants of distance metrics, SCORE can be efficiently minimized. Empirically, our models achieve top-rank performance on RobustBench under AutoAttack. Besides, SCORE provides instructive insights for explaining the overfitting phenomenon and semantic input gradients observed on robust models.}\n}", "pdf": "https://proceedings.mlr.press/v162/pang22a/pang22a.pdf", "supp": "", "pdf_size": 6830128, "gs_citation": 166, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12573058517676493723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Dept. of Comp. Sci. and Tech., Institute for AI, BNRist Center, THBI Lab, Tsinghua-Bosch Joint Center for ML, Tsinghua University+Sea AI Lab, Singapore; Sea AI Lab, Singapore; Dept. of Comp. Sci. and Tech., Institute for AI, BNRist Center, THBI Lab, Tsinghua-Bosch Joint Center for ML, Tsinghua University; Dept. of Comp. Sci. and Tech., Institute for AI, BNRist Center, THBI Lab, Tsinghua-Bosch Joint Center for ML, Tsinghua University; Sea AI Lab, Singapore", "aff_domain": "tsinghua.edu.cn;sea.com; ;tsinghua.edu.cn;sea.com", "email": "tsinghua.edu.cn;sea.com; ;tsinghua.edu.cn;sea.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/pang22a.html", "aff_unique_index": "0+1;1;0;0;1", "aff_unique_norm": "Tsinghua University;Sea AI Lab", "aff_unique_dep": "Dept. of Comp. Sci. 
and Tech.;", "aff_unique_url": "https://www.tsinghua.edu.cn;", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;1", "aff_country_unique": "China;Singapore" }, { "title": "Robustness in Multi-Objective Submodular Optimization: a Quantile Approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17167", "id": "17167", "proceeding": "https://proceedings.mlr.press/v162/malherbe22a.html", "poster": "/media/PosterPDFs/ICML%202022/95192c98732387165bf8e396c0f2dad2.png?t=1657184175.6819127", "slides": "", "author_site": "Cedric Malherbe, Kevin Scaman", "author": "Cedric Malherbe; Kevin Scaman", "abstract": "The optimization of multi-objective submodular systems appears in a wide variety of applications. However, there are currently very few techniques which are able to provide a robust allocation to such systems. In this work, we propose to design and analyse novel algorithms for the robust allocation of submodular systems through lens of quantile maximization. We start by observing that identifying an exact solution for this problem is computationally intractable. To tackle this issue, we propose a proxy for the quantile function using a softmax formulation, and show that this proxy is well suited to submodular optimization. Based on this relaxation, we propose a novel and simple algorithm called SOFTSAT. Theoretical properties are provided for this algorithm as well as novel approximation guarantees. Finally, we provide numerical experiments showing the efficiency of our algorithm with regards to state-of-the-art methods in a test bed of real-world applications, and show that SOFTSAT is particularly robust and well-suited to online scenarios.", "bibtex": "@InProceedings{pmlr-v162-malherbe22a,\n title = \t {Robustness in Multi-Objective Submodular Optimization: a Quantile Approach},\n author = {Malherbe, Cedric and Scaman, Kevin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14871--14886},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/malherbe22a/malherbe22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/malherbe22a.html},\n abstract = \t {The optimization of multi-objective submodular systems appears in a wide variety of applications. However, there are currently very few techniques which are able to provide a robust allocation to such systems. In this work, we propose to design and analyse novel algorithms for the robust allocation of submodular systems through lens of quantile maximization. We start by observing that identifying an exact solution for this problem is computationally intractable. To tackle this issue, we propose a proxy for the quantile function using a softmax formulation, and show that this proxy is well suited to submodular optimization. Based on this relaxation, we propose a novel and simple algorithm called SOFTSAT. Theoretical properties are provided for this algorithm as well as novel approximation guarantees. 
Finally, we provide numerical experiments showing the efficiency of our algorithm with regards to state-of-the-art methods in a test bed of real-world applications, and show that SOFTSAT is particularly robust and well-suited to online scenarios.}\n}", "pdf": "https://proceedings.mlr.press/v162/malherbe22a/malherbe22a.pdf", "supp": "", "pdf_size": 670773, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4888262158012936637&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Huawei Noah\u2019s Ark Lab; DI ENS, \u00c9cole normale sup\u00e9rieure, CNRS, INRIA, PSL University + Huawei", "aff_domain": "huawei.com; ", "email": "huawei.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/malherbe22a.html", "aff_unique_index": "0;1+0", "aff_unique_norm": "Huawei;\u00c9cole Normale Sup\u00e9rieure", "aff_unique_dep": "Noah\u2019s Ark Lab;DI ENS", "aff_unique_url": "https://www.huawei.com;https://www.ens.fr", "aff_unique_abbr": "Huawei;ENS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0", "aff_country_unique": "China;France" }, { "title": "Role-based Multiplex Network Embedding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16571", "id": "16571", "proceeding": "https://proceedings.mlr.press/v162/zhang22m.html", "poster": "", "slides": "/media/icml-2022/Slides/16571.pdf", "author_site": "Hegui Zhang, Gang Kou", "author": "Hegui Zhang; Gang Kou", "abstract": "In recent years, multiplex network embedding has received great attention from researchers. However, existing multiplex network embedding methods neglect structural role information, which can be used to determine the structural similarity between nodes. To overcome this shortcoming, this work proposes a simple, effective, role-based embedding method for multiplex networks, called RMNE. The RMNE uses the structural role information of nodes to preserve the structural similarity between nodes in the entire multiplex network. Specifically, a role-modified random walk is designed to generate node sequences of each node, which can capture both the within-layer neighbors, structural role members, and cross-layer structural role members of a node. Additionally, the variant of RMNE extends the existing collaborative embedding method by unifying the structural role information into our method to obtain the role-based node representations. Finally, the proposed methods were evaluated on the network reconstruction, node classification, link prediction, and multi-class edge classification tasks. 
The experimental results on eight public, real-world multiplex networks demonstrate that the proposed methods outperform state-of-the-art baseline methods.", "bibtex": "@InProceedings{pmlr-v162-zhang22m,\n title = \t {Role-based Multiplex Network Embedding},\n author = {Zhang, Hegui and Kou, Gang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26265--26280},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22m/zhang22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22m.html},\n abstract = \t {In recent years, multiplex network embedding has received great attention from researchers. However, existing multiplex network embedding methods neglect structural role information, which can be used to determine the structural similarity between nodes. To overcome this shortcoming, this work proposes a simple, effective, role-based embedding method for multiplex networks, called RMNE. The RMNE uses the structural role information of nodes to preserve the structural similarity between nodes in the entire multiplex network. Specifically, a role-modified random walk is designed to generate node sequences of each node, which can capture both the within-layer neighbors, structural role members, and cross-layer structural role members of a node. Additionally, the variant of RMNE extends the existing collaborative embedding method by unifying the structural role information into our method to obtain the role-based node representations. Finally, the proposed methods were evaluated on the network reconstruction, node classification, link prediction, and multi-class edge classification tasks. 
The experimental results on eight public, real-world multiplex networks demonstrate that the proposed methods outperform state-of-the-art baseline methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22m/zhang22m.pdf", "supp": "", "pdf_size": 1024614, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5833839627813461066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "School of Business Administration, Faculty of Business Administration, Southwestern University of Finance and Economics, Chengdu 611130, China; School of Business Administration, Faculty of Business Administration, Southwestern University of Finance and Economics, Chengdu 611130, China", "aff_domain": "swufe.edu.cn;yahoo.com", "email": "swufe.edu.cn;yahoo.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zhang22m.html", "aff_unique_index": "0;0", "aff_unique_norm": "Southwestern University of Finance and Economics", "aff_unique_dep": "School of Business Administration", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Chengdu", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Rotting Infinitely Many-Armed Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17823", "id": "17823", "proceeding": "https://proceedings.mlr.press/v162/kim22j.html", "poster": "/media/PosterPDFs/ICML%202022/e1696007be4eefb81b1a1d39ce48681b_3uKwXs4.png?t=1658034982.0668244", "slides": "/media/icml-2022/Slides/17823_TNxmD0A.pdf", "author_site": "Jung-hun Kim, Milan Vojnovic, Se-Young Yun", "author": "Jung-Hun Kim; Milan Vojnovic; Se-Young Yun", "abstract": "We consider the infinitely many-armed bandit problem with rotting rewards, where the mean reward of an arm decreases at each pull of the arm according to an arbitrary trend with maximum rotting rate $\\varrho=o(1)$. We show that this learning problem has an $\\Omega(\\max\\{\\varrho^{1/3}T, \\sqrt{T}\\})$ worst-case regret lower bound where $T$ is the time horizon. We show that a matching upper bound $\\tilde{O}(\\max\\{\\varrho^{1/3}T, \\sqrt{T}\\})$, up to a poly-logarithmic factor, can be achieved by an algorithm that uses a UCB index for each arm and a threshold value to decide whether to continue pulling an arm or remove the arm from further consideration, when the algorithm knows the value of the maximum rotting rate $\\varrho$. 
We also show that an $\\tilde{O}(\\max\\{\\varrho^{1/3}T, T^{3/4}\\})$ regret upper bound can be achieved by an algorithm that does not know the value of $\\varrho$, by using an adaptive UCB index along with an adaptive threshold value.", "bibtex": "@InProceedings{pmlr-v162-kim22j,\n title = \t {Rotting Infinitely Many-Armed Bandits},\n author = {Kim, Jung-Hun and Vojnovic, Milan and Yun, Se-Young},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11229--11254},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22j/kim22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22j.html},\n abstract = \t {We consider the infinitely many-armed bandit problem with rotting rewards, where the mean reward of an arm decreases at each pull of the arm according to an arbitrary trend with maximum rotting rate $\\varrho=o(1)$. We show that this learning problem has an $\\Omega(\\max\\{\\varrho^{1/3}T, \\sqrt{T}\\})$ worst-case regret lower bound where $T$ is the time horizon. We show that a matching upper bound $\\tilde{O}(\\max\\{\\varrho^{1/3}T, \\sqrt{T}\\})$, up to a poly-logarithmic factor, can be achieved by an algorithm that uses a UCB index for each arm and a threshold value to decide whether to continue pulling an arm or remove the arm from further consideration, when the algorithm knows the value of the maximum rotting rate $\\varrho$. We also show that an $\\tilde{O}(\\max\\{\\varrho^{1/3}T, T^{3/4}\\})$ regret upper bound can be achieved by an algorithm that does not know the value of $\\varrho$, by using an adaptive UCB index along with an adaptive threshold value.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22j/kim22j.pdf", "supp": "", "pdf_size": 547565, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7431943945679360181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea; London School of Economics, London, UK; Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea", "aff_domain": "kaist.ac.kr;lse.ac.uk;kaist.ac.kr", "email": "kaist.ac.kr;lse.ac.uk;kaist.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kim22j.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;London School of Economics", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;https://www.lse.ac.uk", "aff_unique_abbr": "KAIST;LSE", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Daejeon;London", "aff_country_unique_index": "0;1;0", "aff_country_unique": "South Korea;United Kingdom" }, { "title": "SCHA-VAE: Hierarchical Context Aggregation for Few-Shot Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17827", "id": "17827", "proceeding": "https://proceedings.mlr.press/v162/giannone22a.html", "poster": "/media/PosterPDFs/ICML%202022/01f78be6f7cad02658508fe4616098a9.png?t=1657326139.6831224", "slides": "", "author_site": "Giorgio Giannone, Ole Winther", "author": "Giorgio Giannone; Ole Winther", "abstract": "A few-shot generative model should be able to generate data from a 
novel distribution by only observing a limited set of examples. In few-shot learning the model is trained on data from many sets from distributions sharing some underlying properties such as sets of characters from different alphabets or objects from different categories. We extend current latent variable models for sets to a fully hierarchical approach with an attention-based point to set-level aggregation and call our method SCHA-VAE for Set-Context-Hierarchical-Aggregation Variational Autoencoder. We explore likelihood-based model comparison, iterative data sampling, and adaptation-free out-of-distribution generalization. Our results show that the hierarchical formulation better captures the intrinsic variability within the sets in the small data regime. This work generalizes deep latent variable approaches to few-shot learning, taking a step toward large-scale few-shot generation with a formulation that readily works with current state-of-the-art deep generative models.", "bibtex": "@InProceedings{pmlr-v162-giannone22a,\n title = \t {{SCHA}-{VAE}: Hierarchical Context Aggregation for Few-Shot Generation},\n author = {Giannone, Giorgio and Winther, Ole},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7550--7569},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/giannone22a/giannone22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/giannone22a.html},\n abstract = \t {A few-shot generative model should be able to generate data from a novel distribution by only observing a limited set of examples. In few-shot learning the model is trained on data from many sets from distributions sharing some underlying properties such as sets of characters from different alphabets or objects from different categories. We extend current latent variable models for sets to a fully hierarchical approach with an attention-based point to set-level aggregation and call our method SCHA-VAE for Set-Context-Hierarchical-Aggregation Variational Autoencoder. We explore likelihood-based model comparison, iterative data sampling, and adaptation-free out-of-distribution generalization. Our results show that the hierarchical formulation better captures the intrinsic variability within the sets in the small data regime. 
This work generalizes deep latent variable approaches to few-shot learning, taking a step toward large-scale few-shot generation with a formulation that readily works with current state-of-the-art deep generative models.}\n}", "pdf": "https://proceedings.mlr.press/v162/giannone22a/giannone22a.pdf", "supp": "", "pdf_size": 2347100, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18154128388289892262&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Technical University of Denmark; University of Copenhagen", "aff_domain": "dtu.dk; ", "email": "dtu.dk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/giannone22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Technical University of Denmark;University of Copenhagen", "aff_unique_dep": ";", "aff_unique_url": "https://www.dtu.dk;https://www.ku.dk", "aff_unique_abbr": "DTU;UCPH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "title": "SDQ: Stochastic Differentiable Quantization with Mixed Precision", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16257", "id": "16257", "proceeding": "https://proceedings.mlr.press/v162/huang22h.html", "poster": "/media/PosterPDFs/ICML%202022/13fe9d84310e77f13a6d184dbf1232f3_cU3yVZ1.png?t=1657524266.5144286", "slides": "", "author_site": "Xijie Huang, Zhiqiang Shen, Shichao Li, Zechun Liu, Hu Xianghong, Jeffry Wicaksana, Eric Xing, Kwang-Ting Cheng", "author": "Xijie Huang; Zhiqiang Shen; Shichao Li; Zechun Liu; Hu Xianghong; Jeffry Wicaksana; Eric Xing; Kwang-Ting Cheng", "abstract": "In order to deploy deep models in a computationally efficient manner, model quantization approaches have been frequently used. In addition, as new hardware that supports various-bit arithmetic operations, recent research on mixed precision quantization (MPQ) begins to fully leverage the capacity of representation by searching various bitwidths for different layers and modules in a network. However, previous studies mainly search the MPQ strategy in a costly scheme using reinforcement learning, neural architecture search, etc., or simply utilize partial prior knowledge for bitwidth distribution, which might be biased and sub-optimal. In this work, we present a novel Stochastic Differentiable Quantization (SDQ) method that can automatically learn the MPQ strategy in a more flexible and globally-optimized space with a smoother gradient approximation. Particularly, Differentiable Bitwidth Parameters (DBPs) are employed as the probability factors in stochastic quantization between adjacent bitwidth. After the optimal MPQ strategy is acquired, we further train our network with the entropy-aware bin regularization and knowledge distillation. We extensively evaluate our method on different networks, hardwares (GPUs and FPGA), and datasets. SDQ outperforms all other state-of-the-art mixed or single precision quantization with less bitwidth, and are even better than the original full-precision counterparts across various ResNet and MobileNet families, demonstrating the effectiveness and superiority of our method. 
Code will be publicly available.", "bibtex": "@InProceedings{pmlr-v162-huang22h,\n title = \t {{SDQ}: Stochastic Differentiable Quantization with Mixed Precision},\n author = {Huang, Xijie and Shen, Zhiqiang and Li, Shichao and Liu, Zechun and Xianghong, Hu and Wicaksana, Jeffry and Xing, Eric and Cheng, Kwang-Ting},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9295--9309},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22h/huang22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22h.html},\n abstract = \t {In order to deploy deep models in a computationally efficient manner, model quantization approaches have been frequently used. In addition, as new hardware that supports various-bit arithmetic operations, recent research on mixed precision quantization (MPQ) begins to fully leverage the capacity of representation by searching various bitwidths for different layers and modules in a network. However, previous studies mainly search the MPQ strategy in a costly scheme using reinforcement learning, neural architecture search, etc., or simply utilize partial prior knowledge for bitwidth distribution, which might be biased and sub-optimal. In this work, we present a novel Stochastic Differentiable Quantization (SDQ) method that can automatically learn the MPQ strategy in a more flexible and globally-optimized space with a smoother gradient approximation. Particularly, Differentiable Bitwidth Parameters (DBPs) are employed as the probability factors in stochastic quantization between adjacent bitwidth. After the optimal MPQ strategy is acquired, we further train our network with the entropy-aware bin regularization and knowledge distillation. We extensively evaluate our method on different networks, hardwares (GPUs and FPGA), and datasets. SDQ outperforms all other state-of-the-art mixed or single precision quantization with less bitwidth, and are even better than the original full-precision counterparts across various ResNet and MobileNet families, demonstrating the effectiveness and superiority of our method. Code will be publicly available.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22h/huang22h.pdf", "supp": "", "pdf_size": 1165641, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14712740554643684357&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "https://huangowen.github.io/SDQ/", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/huang22h.html" }, { "title": "SE(3) Equivariant Graph Neural Networks with Complete Local Frames", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18345", "id": "18345", "proceeding": "https://proceedings.mlr.press/v162/du22e.html", "poster": "/media/PosterPDFs/ICML%202022/dc6a70712a252123c40d2adba6a11d84.png?t=1658127300.6244795", "slides": "", "author_site": "weitao du, He Zhang, Yuanqi Du, Qi Meng, Wei Chen, Nanning Zheng, Bin Shao, Tie-Yan Liu", "author": "Weitao Du; He Zhang; Yuanqi Du; Qi Meng; Wei Chen; Nanning Zheng; Bin Shao; Tie-Yan Liu", "abstract": "Group equivariance (e.g. 
SE(3) equivariance) is a critical physical symmetry in science, from classical and quantum physics to computational biology. It enables robust and accurate prediction under arbitrary reference transformations. In light of this, great efforts have been put on encoding this symmetry into deep neural networks, which has been shown to improve the generalization performance and data efficiency for downstream tasks. Constructing an equivariant neural network generally brings high computational costs to ensure expressiveness. Therefore, how to better trade-off the expressiveness and computational efficiency plays a core role in the design of the equivariant deep learning models. In this paper, we propose a framework to construct SE(3) equivariant graph neural networks that can approximate the geometric quantities efficiently. Inspired by differential geometry and physics, we introduce equivariant local complete frames to graph neural networks, such that tensor information at given orders can be projected onto the frames. The local frame is constructed to form an orthonormal basis that avoids direction degeneration and ensure completeness. Since the frames are built only by cross product operations, our method is computationally efficient. We evaluate our method on two tasks: Newton mechanics modeling and equilibrium molecule conformation generation. Extensive experimental results demonstrate that our model achieves the best or competitive performance in two types of datasets.", "bibtex": "@InProceedings{pmlr-v162-du22e,\n title = \t {{SE}(3) Equivariant Graph Neural Networks with Complete Local Frames},\n author = {Du, Weitao and Zhang, He and Du, Yuanqi and Meng, Qi and Chen, Wei and Zheng, Nanning and Shao, Bin and Liu, Tie-Yan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5583--5608},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/du22e/du22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/du22e.html},\n abstract = \t {Group equivariance (e.g. SE(3) equivariance) is a critical physical symmetry in science, from classical and quantum physics to computational biology. It enables robust and accurate prediction under arbitrary reference transformations. In light of this, great efforts have been put on encoding this symmetry into deep neural networks, which has been shown to improve the generalization performance and data efficiency for downstream tasks. Constructing an equivariant neural network generally brings high computational costs to ensure expressiveness. Therefore, how to better trade-off the expressiveness and computational efficiency plays a core role in the design of the equivariant deep learning models. In this paper, we propose a framework to construct SE(3) equivariant graph neural networks that can approximate the geometric quantities efficiently. Inspired by differential geometry and physics, we introduce equivariant local complete frames to graph neural networks, such that tensor information at given orders can be projected onto the frames. The local frame is constructed to form an orthonormal basis that avoids direction degeneration and ensure completeness. 
Since the frames are built only by cross product operations, our method is computationally efficient. We evaluate our method on two tasks: Newton mechanics modeling and equilibrium molecule conformation generation. Extensive experimental results demonstrate that our model achieves the best or competitive performance in two types of datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/du22e/du22e.pdf", "supp": "", "pdf_size": 3635596, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14602440346377958112&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Chinese Academy of Sciences; Xi\u2019an Jiaotong University + Microsoft Research; George Mason University + Microsoft Research; Microsoft Research; Chinese Academy of Sciences + Microsoft Research; Xi\u2019an Jiaotong University; Microsoft Research; Microsoft Research", "aff_domain": "amss.ac.cn; ; ;microsoft.com; ; ; ; ", "email": "amss.ac.cn; ; ;microsoft.com; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/du22e.html", "aff_unique_index": "0;1+2;3+2;2;0+2;1;2;2", "aff_unique_norm": "Chinese Academy of Sciences;Xi'an Jiao Tong University;Microsoft;George Mason University", "aff_unique_dep": ";;Microsoft Research;", "aff_unique_url": "https://www.cas.cn;https://www.xjtu.edu.cn;https://www.microsoft.com/en-us/research;https://www.gmu.edu", "aff_unique_abbr": "CAS;XJTU;MSR;GMU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;1+1;1;0+1;0;1;1", "aff_country_unique": "China;United States" }, { "title": "SPDY: Accurate Pruning with Speedup Guarantees", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17437", "id": "17437", "proceeding": "https://proceedings.mlr.press/v162/frantar22a.html", "poster": "/media/PosterPDFs/ICML%202022/1819fb9034f796275e6f64950a134e2a.png?t=1658416617.707445", "slides": "", "author_site": "Elias Frantar, Dan Alistarh", "author": "Elias Frantar; Dan Alistarh", "abstract": "The recent focus on the efficiency of deep neural networks (DNNs) has led to significant work on model compression approaches, of which weight pruning is one of the most popular. At the same time, there is rapidly-growing computational support for efficiently executing the unstructured-sparse models obtained via pruning. Yet, most existing pruning methods minimize just the number of remaining weights, i.e. the size of the model, rather than optimizing for inference time. We address this gap by introducing SPDY, a new compression method which automatically determines layer-wise sparsity targets achieving a desired inference speedup on a given system, while minimizing accuracy loss. SPDY is the composition of two new techniques. The first is an efficient and general dynamic programming algorithm for solving constrained layer-wise compression problems, given a set of layer-wise error scores. The second technique is a local search procedure for automatically determining such scores in an accurate and robust manner. Experiments across popular vision and language models show that SPDY guarantees speedups while recovering higher accuracy relative to existing strategies, both for one-shot and gradual pruning scenarios, and is compatible with most existing pruning approaches. 
We also extend our approach to the recently-proposed task of pruning with very little data, where we achieve the best known accuracy recovery when pruning to the GPU-supported 2:4 sparsity pattern.", "bibtex": "@InProceedings{pmlr-v162-frantar22a,\n title = \t {{SPDY}: Accurate Pruning with Speedup Guarantees},\n author = {Frantar, Elias and Alistarh, Dan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6726--6743},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/frantar22a/frantar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/frantar22a.html},\n abstract = \t {The recent focus on the efficiency of deep neural networks (DNNs) has led to significant work on model compression approaches, of which weight pruning is one of the most popular. At the same time, there is rapidly-growing computational support for efficiently executing the unstructured-sparse models obtained via pruning. Yet, most existing pruning methods minimize just the number of remaining weights, i.e. the size of the model, rather than optimizing for inference time. We address this gap by introducing SPDY, a new compression method which automatically determines layer-wise sparsity targets achieving a desired inference speedup on a given system, while minimizing accuracy loss. SPDY is the composition of two new techniques. The first is an efficient and general dynamic programming algorithm for solving constrained layer-wise compression problems, given a set of layer-wise error scores. The second technique is a local search procedure for automatically determining such scores in an accurate and robust manner. Experiments across popular vision and language models show that SPDY guarantees speedups while recovering higher accuracy relative to existing strategies, both for one-shot and gradual pruning scenarios, and is compatible with most existing pruning approaches. 
We also extend our approach to the recently-proposed task of pruning with very little data, where we achieve the best known accuracy recovery when pruning to the GPU-supported 2:4 sparsity pattern.}\n}", "pdf": "https://proceedings.mlr.press/v162/frantar22a/frantar22a.pdf", "supp": "", "pdf_size": 615916, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9481477632006628831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "IST Austria; IST Austria + Neural Magic", "aff_domain": "ist.ac.at;ist.ac.at", "email": "ist.ac.at;ist.ac.at", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/frantar22a.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "Institute of Science and Technology Austria;Neural Magic", "aff_unique_dep": ";", "aff_unique_url": "https://www.ist.ac.at;", "aff_unique_abbr": "IST Austria;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1", "aff_country_unique": "Austria;United States" }, { "title": "SPECTRE: Spectral Conditioning Helps to Overcome the Expressivity Limits of One-shot Graph Generators", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18393", "id": "18393", "proceeding": "https://proceedings.mlr.press/v162/martinkus22a.html", "poster": "/media/PosterPDFs/ICML%202022/1b388c8b7c863fde3f559142fdc123b0.png?t=1657374691.4983776", "slides": "", "author_site": "Karolis Martinkus, Andreas Loukas, Nathana\u00ebl Perraudin, Roger Wattenhofer", "author": "Karolis Martinkus; Andreas Loukas; Nathana\u00ebl Perraudin; Roger Wattenhofer", "abstract": "We approach the graph generation problem from a spectral perspective by first generating the dominant parts of the graph Laplacian spectrum and then building a graph matching these eigenvalues and eigenvectors. Spectral conditioning allows for direct modeling of the global and local graph structure and helps to overcome the expressivity and mode collapse issues of one-shot graph generators. Our novel GAN, called SPECTRE, enables the one-shot generation of much larger graphs than previously possible with one-shot models. SPECTRE outperforms state-of-the-art deep autoregressive generators in terms of modeling fidelity, while also avoiding expensive sequential generation and dependence on node ordering. 
A case in point, in sizable synthetic and real-world graphs SPECTRE achieves a 4-to-170 fold improvement over the best competitor that does not overfit and is 23-to-30 times faster than autoregressive generators.", "bibtex": "@InProceedings{pmlr-v162-martinkus22a,\n title = \t {{SPECTRE}: Spectral Conditioning Helps to Overcome the Expressivity Limits of One-shot Graph Generators},\n author = {Martinkus, Karolis and Loukas, Andreas and Perraudin, Nathana{\\\"e}l and Wattenhofer, Roger},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15159--15179},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/martinkus22a/martinkus22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/martinkus22a.html},\n abstract = \t {We approach the graph generation problem from a spectral perspective by first generating the dominant parts of the graph Laplacian spectrum and then building a graph matching these eigenvalues and eigenvectors. Spectral conditioning allows for direct modeling of the global and local graph structure and helps to overcome the expressivity and mode collapse issues of one-shot graph generators. Our novel GAN, called SPECTRE, enables the one-shot generation of much larger graphs than previously possible with one-shot models. SPECTRE outperforms state-of-the-art deep autoregressive generators in terms of modeling fidelity, while also avoiding expensive sequential generation and dependence on node ordering. A case in point, in sizable synthetic and real-world graphs SPECTRE achieves a 4-to-170 fold improvement over the best competitor that does not overfit and is 23-to-30 times faster than autoregressive generators.}\n}", "pdf": "https://proceedings.mlr.press/v162/martinkus22a/martinkus22a.pdf", "supp": "", "pdf_size": 1912858, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12175380990160510944&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "ETH Zurich; EPFL and Prescient Design, Genentech; Swiss Data Science Center; ETH Zurich", "aff_domain": "ethz.com; ; ; ", "email": "ethz.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/martinkus22a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "ETH Zurich;EPFL;Swiss Data Science Center", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ethz.ch;https://www.epfl.ch;https://www.sds.cern.ch", "aff_unique_abbr": "ETHZ;EPFL;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "SQ-VAE: Variational Bayes on Discrete Representation with Self-annealed Stochastic Quantization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17787", "id": "17787", "proceeding": "https://proceedings.mlr.press/v162/takida22a.html", "poster": "/media/PosterPDFs/ICML%202022/d77c703536718b95308130ff2e5cf9ee_EIzgPu6.png?t=1657866087.2058103", "slides": "", "author_site": "Yuhta Takida, Takashi Shibuya, WeiHsiang Liao, Chieh-Hsin Lai, Junki Ohmura, Toshimitsu Uesaka, Naoki Murata, Shusuke Takahashi, Toshiyuki Kumakura, Yuki Mitsufuji", "author": "Yuhta Takida; Takashi Shibuya; Weihsiang Liao; Chieh-Hsin Lai; Junki Ohmura; 
Toshimitsu Uesaka; Naoki Murata; Shusuke Takahashi; Toshiyuki Kumakura; Yuki Mitsufuji", "abstract": "One noted issue of vector-quantized variational autoencoder (VQ-VAE) is that the learned discrete representation uses only a fraction of the full capacity of the codebook, also known as codebook collapse. We hypothesize that the training scheme of VQ-VAE, which involves some carefully designed heuristics, underlies this issue. In this paper, we propose a new training scheme that extends the standard VAE via novel stochastic dequantization and quantization, called stochastically quantized variational autoencoder (SQ-VAE). In SQ-VAE, we observe a trend that the quantization is stochastic at the initial stage of the training but gradually converges toward a deterministic quantization, which we call self-annealing. Our experiments show that SQ-VAE improves codebook utilization without using common heuristics. Furthermore, we empirically show that SQ-VAE is superior to VAE and VQ-VAE in vision- and speech-related tasks.", "bibtex": "@InProceedings{pmlr-v162-takida22a,\n title = \t {{SQ}-{VAE}: Variational {B}ayes on Discrete Representation with Self-annealed Stochastic Quantization},\n author = {Takida, Yuhta and Shibuya, Takashi and Liao, Weihsiang and Lai, Chieh-Hsin and Ohmura, Junki and Uesaka, Toshimitsu and Murata, Naoki and Takahashi, Shusuke and Kumakura, Toshiyuki and Mitsufuji, Yuki},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20987--21012},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/takida22a/takida22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/takida22a.html},\n abstract = \t {One noted issue of vector-quantized variational autoencoder (VQ-VAE) is that the learned discrete representation uses only a fraction of the full capacity of the codebook, also known as codebook collapse. We hypothesize that the training scheme of VQ-VAE, which involves some carefully designed heuristics, underlies this issue. In this paper, we propose a new training scheme that extends the standard VAE via novel stochastic dequantization and quantization, called stochastically quantized variational autoencoder (SQ-VAE). In SQ-VAE, we observe a trend that the quantization is stochastic at the initial stage of the training but gradually converges toward a deterministic quantization, which we call self-annealing. Our experiments show that SQ-VAE improves codebook utilization without using common heuristics. 
Furthermore, we empirically show that SQ-VAE is superior to VAE and VQ-VAE in vision- and speech-related tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/takida22a/takida22a.pdf", "supp": "", "pdf_size": 15845645, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13353459274510421570&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Group Corporation, Japan; Sony Corporation of America, USA; Sony Group Corporation, Japan", "aff_domain": "sony.com; ; ; ; ; ; ; ; ; ", "email": "sony.com; ; ; ; ; ; ; ; ; ", "github": "https://github.com/sony/sqvae", "project": "", "author_num": 10, "oa": "https://proceedings.mlr.press/v162/takida22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;1;0", "aff_unique_norm": "Sony Group Corporation;Sony Corporation of America", "aff_unique_dep": ";", "aff_unique_url": "https://www.sony.com;https://www.sony.com", "aff_unique_abbr": "Sony;Sony", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1;0", "aff_country_unique": "Japan;United States" }, { "title": "Safe Exploration for Efficient Policy Evaluation and Comparison", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16135", "id": "16135", "proceeding": "https://proceedings.mlr.press/v162/wan22b.html", "poster": "/media/PosterPDFs/ICML%202022/8f85517967795eeef66c225f7883bdcb.png?t=1657508610.7348945", "slides": "", "author_site": "Runzhe Wan, Branislav Kveton, Rui Song", "author": "Runzhe Wan; Branislav Kveton; Rui Song", "abstract": "High-quality data plays a central role in ensuring the accuracy of policy evaluation. This paper initiates the study of efficient and safe data collection for bandit policy evaluation. We formulate the problem and investigate its several representative variants. For each variant, we analyze its statistical properties, derive the corresponding exploration policy, and design an efficient algorithm for computing it. Both theoretical analysis and experiments support the usefulness of the proposed methods.", "bibtex": "@InProceedings{pmlr-v162-wan22b,\n title = \t {Safe Exploration for Efficient Policy Evaluation and Comparison},\n author = {Wan, Runzhe and Kveton, Branislav and Song, Rui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22491--22511},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wan22b/wan22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wan22b.html},\n abstract = \t {High-quality data plays a central role in ensuring the accuracy of policy evaluation. This paper initiates the study of efficient and safe data collection for bandit policy evaluation. We formulate the problem and investigate its several representative variants. For each variant, we analyze its statistical properties, derive the corresponding exploration policy, and design an efficient algorithm for computing it. 
Both theoretical analysis and experiments support the usefulness of the proposed methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wan22b/wan22b.pdf", "supp": "", "pdf_size": 703710, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14729535773849202022&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Statistics, North Carolina State University; Amazon; Department of Statistics, North Carolina State University", "aff_domain": "ncsu.edu;amazon.com;ncsu.edu", "email": "ncsu.edu;amazon.com;ncsu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wan22b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "North Carolina State University;Amazon", "aff_unique_dep": "Department of Statistics;Amazon.com, Inc.", "aff_unique_url": "https://www.ncsu.edu;https://www.amazon.com", "aff_unique_abbr": "NCSU;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Safe Learning in Tree-Form Sequential Decision Making: Handling Hard and Soft Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17521", "id": "17521", "proceeding": "https://proceedings.mlr.press/v162/bernasconi22a.html", "poster": "", "slides": "", "author_site": "Martino Bernasconi, Federico Cacciamani, Matteo Castiglioni, Alberto Marchesi, Nicola Gatti, Francesco Trov\u00f2", "author": "Martino Bernasconi; Federico Cacciamani; Matteo Castiglioni; Alberto Marchesi; Nicola Gatti; Francesco Trov\u00f2", "abstract": "We study decision making problems in which an agent sequentially interacts with a stochastic environment defined by means of a tree structure. The agent repeatedly faces the environment over time, and, after each round, it perceives a utility and a cost, which are both stochastic. The goal of the agent is to learn an optimal strategy in an online fashion, while, at the same time, keeping costs below a given safety threshold. Our model naturally fits many real-world scenarios, such as, e.g., opponent exploitation in games and web link selection. We study the hard-threshold problem of achieving sublinear regret while guaranteeing that the threshold constraint is satisfied at every iteration with high probability. First, we show that, in general, any algorithm with such a guarantee incurs in a linear regret. This motivates the introduction of a relaxed problem, namely the soft-threshold problem, in which we only require that the cumulative violation of the threshold constraint grows sublinearly, and, thus, we can provide an algorithm with sublinear regret. Next, we show how, in the hard-threshold problem, a sublinear regret algorithm can be designed under the additional assumption that there exists a known strategy strictly satisfying the threshold constraint. We also show that our regret bounds are tight. 
Finally, we cast the opponent exploitation problem to our model, and we experimentally evaluate our algorithms on a standard testbed of games.", "bibtex": "@InProceedings{pmlr-v162-bernasconi22a,\n title = \t {Safe Learning in Tree-Form Sequential Decision Making: Handling Hard and Soft Constraints},\n author = {Bernasconi, Martino and Cacciamani, Federico and Castiglioni, Matteo and Marchesi, Alberto and Gatti, Nicola and Trov{\\`o}, Francesco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1854--1873},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bernasconi22a/bernasconi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bernasconi22a.html},\n abstract = \t {We study decision making problems in which an agent sequentially interacts with a stochastic environment defined by means of a tree structure. The agent repeatedly faces the environment over time, and, after each round, it perceives a utility and a cost, which are both stochastic. The goal of the agent is to learn an optimal strategy in an online fashion, while, at the same time, keeping costs below a given safety threshold. Our model naturally fits many real-world scenarios, such as, e.g., opponent exploitation in games and web link selection. We study the hard-threshold problem of achieving sublinear regret while guaranteeing that the threshold constraint is satisfied at every iteration with high probability. First, we show that, in general, any algorithm with such a guarantee incurs in a linear regret. This motivates the introduction of a relaxed problem, namely the soft-threshold problem, in which we only require that the cumulative violation of the threshold constraint grows sublinearly, and, thus, we can provide an algorithm with sublinear regret. Next, we show how, in the hard-threshold problem, a sublinear regret algorithm can be designed under the additional assumption that there exists a known strategy strictly satisfying the threshold constraint. We also show that our regret bounds are tight. 
Finally, we cast the opponent exploitation problem to our model, and we experimentally evaluate our algorithms on a standard testbed of games.}\n}", "pdf": "https://proceedings.mlr.press/v162/bernasconi22a/bernasconi22a.pdf", "supp": "", "pdf_size": 609671, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14542840124710945063&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Milano; Politecnico di Milano; Politecnico di Milano; Politecnico di Milano", "aff_domain": "polimi.it; ; ; ; ; ", "email": "polimi.it; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/bernasconi22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Sample Efficient Learning of Predictors that Complement Humans", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17543", "id": "17543", "proceeding": "https://proceedings.mlr.press/v162/charusaie22a.html", "poster": "/media/PosterPDFs/ICML%202022/32f6c513b25df1c670753eb7335c4258.png?t=1657750126.7844672", "slides": "", "author_site": "Mohammad-Amin Charusaie, Hussein Mozannar, David Sontag, Samira Samadi", "author": "Mohammad-Amin Charusaie; Hussein Mozannar; David Sontag; Samira Samadi", "abstract": "One of the goals of learning algorithms is to complement and reduce the burden on human decision makers. The expert deferral setting wherein an algorithm can either predict on its own or defer the decision to a downstream expert helps accomplish this goal. A fundamental aspect of this setting is the need to learn complementary predictors that improve on the human\u2019s weaknesses rather than learning predictors optimized for average error. In this work, we provide the first theoretical analysis of the benefit of learning complementary predictors in expert deferral. To enable efficiently learning such predictors, we consider a family of consistent surrogate loss functions for expert deferral and analyze their theoretical properties. Finally, we design active learning schemes that require minimal amount of data of human expert predictions in order to learn accurate deferral systems.", "bibtex": "@InProceedings{pmlr-v162-charusaie22a,\n title = \t {Sample Efficient Learning of Predictors that Complement Humans},\n author = {Charusaie, Mohammad-Amin and Mozannar, Hussein and Sontag, David and Samadi, Samira},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2972--3005},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/charusaie22a/charusaie22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/charusaie22a.html},\n abstract = \t {One of the goals of learning algorithms is to complement and reduce the burden on human decision makers. The expert deferral setting wherein an algorithm can either predict on its own or defer the decision to a downstream expert helps accomplish this goal. 
A fundamental aspect of this setting is the need to learn complementary predictors that improve on the human\u2019s weaknesses rather than learning predictors optimized for average error. In this work, we provide the first theoretical analysis of the benefit of learning complementary predictors in expert deferral. To enable efficiently learning such predictors, we consider a family of consistent surrogate loss functions for expert deferral and analyze their theoretical properties. Finally, we design active learning schemes that require minimal amount of data of human expert predictions in order to learn accurate deferral systems.}\n}", "pdf": "https://proceedings.mlr.press/v162/charusaie22a/charusaie22a.pdf", "supp": "", "pdf_size": 694696, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14604138868272717546&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA; CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA; CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "tuebingen.mpg.de;mit.edu; ; ", "email": "tuebingen.mpg.de;mit.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/charusaie22a.html", "aff_unique_index": "0+1;1;1;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Massachusetts Institute of Technology", "aff_unique_dep": ";Computer Science and Artificial Intelligence Laboratory (CSAIL)", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.csail.mit.edu", "aff_unique_abbr": "MPI-IS;MIT", "aff_campus_unique_index": "0+1;1;1;0", "aff_campus_unique": "T\u00fcbingen;Cambridge", "aff_country_unique_index": "0+1;1;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Sample and Communication-Efficient Decentralized Actor-Critic Algorithms with Finite-Time Analysis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18417", "id": "18417", "proceeding": "https://proceedings.mlr.press/v162/chen22ah.html", "poster": "/media/PosterPDFs/ICML%202022/609e9d4bcc8157c00808993f612f1acd.png?t=1656648126.1382275", "slides": "/media/icml-2022/Slides/18417.pdf", "author_site": "Ziyi Chen, Yi Zhou, Rong-Rong Chen, Shaofeng Zou", "author": "Ziyi Chen; Yi Zhou; Rong-Rong Chen; Shaofeng Zou", "abstract": "Actor-critic (AC) algorithms have been widely used in decentralized multi-agent systems to learn the optimal joint control policy. However, existing decentralized AC algorithms either need to share agents\u2019 sensitive information or lack communication-efficiency. In this work, we develop decentralized AC and natural AC (NAC) algorithms that avoid sharing agents\u2019 local information and are sample and communication-efficient. In both algorithms, agents share only noisy rewards and use mini-batch local policy gradient updates to ensure high sample and communication efficiency. Particularly for decentralized NAC, we develop a decentralized Markovian SGD algorithm with an adaptive mini-batch size to efficiently compute the natural policy gradient. 
Under Markovian sampling and linear function approximation, we prove that the proposed decentralized AC and NAC algorithms achieve the state-of-the-art sample complexities $\\mathcal{O}(\\epsilon^{-2}\\ln\\epsilon^{-1})$ and $\\mathcal{O}(\\epsilon^{-3}\\ln\\epsilon^{-1})$, respectively, and achieve an improved communication complexity $\\mathcal{O}(\\epsilon^{-1}\\ln\\epsilon^{-1})$. Numerical experiments demonstrate that the proposed algorithms achieve lower sample and communication complexities than the existing decentralized AC algorithms.", "bibtex": "@InProceedings{pmlr-v162-chen22ah,\n title = \t {Sample and Communication-Efficient Decentralized Actor-Critic Algorithms with Finite-Time Analysis},\n author = {Chen, Ziyi and Zhou, Yi and Chen, Rong-Rong and Zou, Shaofeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3794--3834},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ah/chen22ah.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ah.html},\n abstract = \t {Actor-critic (AC) algorithms have been widely used in decentralized multi-agent systems to learn the optimal joint control policy. However, existing decentralized AC algorithms either need to share agents\u2019 sensitive information or lack communication-efficiency. In this work, we develop decentralized AC and natural AC (NAC) algorithms that avoid sharing agents\u2019 local information and are sample and communication-efficient. In both algorithms, agents share only noisy rewards and use mini-batch local policy gradient updates to ensure high sample and communication efficiency. Particularly for decentralized NAC, we develop a decentralized Markovian SGD algorithm with an adaptive mini-batch size to efficiently compute the natural policy gradient. Under Markovian sampling and linear function approximation, we prove that the proposed decentralized AC and NAC algorithms achieve the state-of-the-art sample complexities $\\mathcal{O}(\\epsilon^{-2}\\ln\\epsilon^{-1})$ and $\\mathcal{O}(\\epsilon^{-3}\\ln\\epsilon^{-1})$, respectively, and achieve an improved communication complexity $\\mathcal{O}(\\epsilon^{-1}\\ln\\epsilon^{-1})$. 
Numerical experiments demonstrate that the proposed algorithms achieve lower sample and communication complexities than the existing decentralized AC algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ah/chen22ah.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22ah-supp.zip", "pdf_size": 3769708, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7961220597958783817&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, University of Utah; Department of Electrical and Computer Engineering, University of Utah; Department of Electrical and Computer Engineering, University of Utah; Department of Electrical Engineering, University at Buffalo", "aff_domain": "utah.edu; ; ; ", "email": "utah.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chen22ah.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of Utah;University at Buffalo", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical Engineering", "aff_unique_url": "https://www.utah.edu;https://www.buffalo.edu", "aff_unique_abbr": "Utah;UB", "aff_campus_unique_index": "1", "aff_campus_unique": ";Buffalo", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sample-Efficient Reinforcement Learning with loglog(T) Switching Cost", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16725", "id": "16725", "proceeding": "https://proceedings.mlr.press/v162/qiao22a.html", "poster": "/media/PosterPDFs/ICML%202022/124c3e4ada4a529aa0fedece80bb42ab.png?t=1657484756.2561657", "slides": "", "author_site": "Dan Qiao, Ming Yin, Ming Min, Yu-Xiang Wang", "author": "Dan Qiao; Ming Yin; Ming Min; Yu-Xiang Wang", "abstract": "We study the problem of reinforcement learning (RL) with low (policy) switching cost {\u2014} a problem well-motivated by real-life RL applications in which deployments of new policies are costly and the number of policy updates must be low. In this paper, we propose a new algorithm based on stage-wise exploration and adaptive policy elimination that achieves a regret of $\\widetilde{O}(\\sqrt{H^4S^2AT})$ while requiring a switching cost of $O(HSA \\log\\log T)$. This is an exponential improvement over the best-known switching cost $O(H^2SA\\log T)$ among existing methods with $\\widetilde{O}(\\mathrm{poly}(H,S,A)\\sqrt{T})$ regret. In the above, $S,A$ denotes the number of states and actions in an $H$-horizon episodic Markov Decision Process model with unknown transitions, and $T$ is the number of steps. As a byproduct of our new techniques, we also derive a reward-free exploration algorithm with a switching cost of $O(HSA)$. Furthermore, we prove a pair of information-theoretical lower bounds which say that (1) Any no-regret algorithm must have a switching cost of $\\Omega(HSA)$; (2) Any $\\widetilde{O}(\\sqrt{T})$ regret algorithm must incur a switching cost of $\\Omega(HSA\\log\\log T)$. 
Both our algorithms are thus optimal in their switching costs.", "bibtex": "@InProceedings{pmlr-v162-qiao22a,\n title = \t {Sample-Efficient Reinforcement Learning with loglog({T}) Switching Cost},\n author = {Qiao, Dan and Yin, Ming and Min, Ming and Wang, Yu-Xiang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18031--18061},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qiao22a/qiao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qiao22a.html},\n abstract = \t {We study the problem of reinforcement learning (RL) with low (policy) switching cost {\u2014} a problem well-motivated by real-life RL applications in which deployments of new policies are costly and the number of policy updates must be low. In this paper, we propose a new algorithm based on stage-wise exploration and adaptive policy elimination that achieves a regret of $\\widetilde{O}(\\sqrt{H^4S^2AT})$ while requiring a switching cost of $O(HSA \\log\\log T)$. This is an exponential improvement over the best-known switching cost $O(H^2SA\\log T)$ among existing methods with $\\widetilde{O}(\\mathrm{poly}(H,S,A)\\sqrt{T})$ regret. In the above, $S,A$ denotes the number of states and actions in an $H$-horizon episodic Markov Decision Process model with unknown transitions, and $T$ is the number of steps. As a byproduct of our new techniques, we also derive a reward-free exploration algorithm with a switching cost of $O(HSA)$. Furthermore, we prove a pair of information-theoretical lower bounds which say that (1) Any no-regret algorithm must have a switching cost of $\\Omega(HSA)$; (2) Any $\\widetilde{O}(\\sqrt{T})$ regret algorithm must incur a switching cost of $\\Omega(HSA\\log\\log T)$. 
Both our algorithms are thus optimal in their switching costs.}\n}", "pdf": "https://proceedings.mlr.press/v162/qiao22a/qiao22a.pdf", "supp": "", "pdf_size": 545372, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17155444875107709541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, UC Santa Barbara + Department of Statistics and Applied Probability, UC Santa Barbara; Department of Computer Science, UC Santa Barbara + Department of Statistics and Applied Probability, UC Santa Barbara; Department of Statistics and Applied Probability, UC Santa Barbara; Department of Computer Science, UC Santa Barbara", "aff_domain": "ucsb.edu; ; ;cs.ucsb.edu", "email": "ucsb.edu; ; ;cs.ucsb.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/qiao22a.html", "aff_unique_index": "0+0;0+0;0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Sanity Simulations for Saliency Methods", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16383", "id": "16383", "proceeding": "https://proceedings.mlr.press/v162/kim22h.html", "poster": "/media/PosterPDFs/ICML%202022/9381fc93ad66f9ec4b2eef71147a6665.png?t=1657205120.1647294", "slides": "", "author_site": "Joon Kim, Gregory Plumb, Ameet Talwalkar", "author": "Joon Sik Kim; Gregory Plumb; Ameet Talwalkar", "abstract": "Saliency methods are a popular class of feature attribution explanation methods that aim to capture a model\u2019s predictive reasoning by identifying \"important\" pixels in an input image. However, the development and adoption of these methods are hindered by the lack of access to ground-truth model reasoning, which prevents accurate evaluation. In this work, we design a synthetic benchmarking framework, SMERF, that allows us to perform ground-truth-based evaluation while controlling the complexity of the model\u2019s reasoning. Experimentally, SMERF reveals significant limitations in existing saliency methods and, as a result, represents a useful tool for the development of new saliency methods.", "bibtex": "@InProceedings{pmlr-v162-kim22h,\n title = \t {Sanity Simulations for Saliency Methods},\n author = {Kim, Joon Sik and Plumb, Gregory and Talwalkar, Ameet},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11173--11200},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22h/kim22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22h.html},\n abstract = \t {Saliency methods are a popular class of feature attribution explanation methods that aim to capture a model\u2019s predictive reasoning by identifying \"important\" pixels in an input image. However, the development and adoption of these methods are hindered by the lack of access to ground-truth model reasoning, which prevents accurate evaluation. 
In this work, we design a synthetic benchmarking framework, SMERF, that allows us to perform ground-truth-based evaluation while controlling the complexity of the model\u2019s reasoning. Experimentally, SMERF reveals significant limitations in existing saliency methods and, as a result, represents a useful tool for the development of new saliency methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22h/kim22h.pdf", "supp": "", "pdf_size": 7237390, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7944058318921349973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Machine Learning Department, Carnegie Mellon University, Pittsburgh, USA; Machine Learning Department, Carnegie Mellon University, Pittsburgh, USA; Machine Learning Department, Carnegie Mellon University, Pittsburgh, USA", "aff_domain": "cmu.edu; ; ", "email": "cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kim22h.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Machine Learning Department", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Saute RL: Almost Surely Safe Reinforcement Learning Using State Augmentation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16981", "id": "16981", "proceeding": "https://proceedings.mlr.press/v162/sootla22a.html", "poster": "/media/PosterPDFs/ICML%202022/0ce2ffd21fc958d9ef0ee9ba5336e357.png?t=1655995646.0160847", "slides": "/media/icml-2022/Slides/16981_nazm5yp.pdf", "author_site": "Aivar Sootla, Alexander I Cowen-Rivers, Taher Jafferjee, Ziyan Wang, David Mguni, Jun Wang, Haitham Bou Ammar", "author": "Aivar Sootla; Alexander I Cowen-Rivers; Taher Jafferjee; Ziyan Wang; David H Mguni; Jun Wang; Haitham Ammar", "abstract": "Satisfying safety constraints almost surely (or with probability one) can be critical for the deployment of Reinforcement Learning (RL) in real-life applications. For example, plane landing and take-off should ideally occur with probability one. We address the problem by introducing Safety Augmented (Saute) Markov Decision Processes (MDPs), where the safety constraints are eliminated by augmenting them into the state-space and reshaping the objective. We show that Saute MDP satisfies the Bellman equation and moves us closer to solving Safe RL with constraints satisfied almost surely. We argue that Saute MDP allows viewing the Safe RL problem from a different perspective enabling new features. For instance, our approach has a plug-and-play nature, i.e., any RL algorithm can be \"Sauteed\u201d. Additionally, state augmentation allows for policy generalization across safety constraints. 
We finally show that Saute RL algorithms can outperform their state-of-the-art counterparts when constraint satisfaction is of high importance.", "bibtex": "@InProceedings{pmlr-v162-sootla22a,\n title = \t {Saute {RL}: Almost Surely Safe Reinforcement Learning Using State Augmentation},\n author = {Sootla, Aivar and Cowen-Rivers, Alexander I and Jafferjee, Taher and Wang, Ziyan and Mguni, David H and Wang, Jun and Ammar, Haitham},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20423--20443},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sootla22a/sootla22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sootla22a.html},\n abstract = \t {Satisfying safety constraints almost surely (or with probability one) can be critical for the deployment of Reinforcement Learning (RL) in real-life applications. For example, plane landing and take-off should ideally occur with probability one. We address the problem by introducing Safety Augmented (Saute) Markov Decision Processes (MDPs), where the safety constraints are eliminated by augmenting them into the state-space and reshaping the objective. We show that Saute MDP satisfies the Bellman equation and moves us closer to solving Safe RL with constraints satisfied almost surely. We argue that Saute MDP allows viewing the Safe RL problem from a different perspective enabling new features. For instance, our approach has a plug-and-play nature, i.e., any RL algorithm can be \"Sauteed\u201d. Additionally, state augmentation allows for policy generalization across safety constraints. 
We finally show that Saute RL algorithms can outperform their state-of-the-art counterparts when constraint satisfaction is of high importance.}\n}", "pdf": "https://proceedings.mlr.press/v162/sootla22a/sootla22a.pdf", "supp": "", "pdf_size": 3766573, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12545517423097788852&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Huawei R&D UK; Huawei R&D UK + Technische Universit\u00e4t Darmstadt; Huawei R&D UK; Huawei R&D UK; Huawei R&D UK; University College London; Huawei R&D UK + University College London", "aff_domain": "huawei.com; ; ; ; ;cs.ucl.ac.uk;huawei.com", "email": "huawei.com; ; ; ; ;cs.ucl.ac.uk;huawei.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/sootla22a.html", "aff_unique_index": "0;0+1;0;0;0;2;0+2", "aff_unique_norm": "Huawei;Technische Universit\u00e4t Darmstadt;University College London", "aff_unique_dep": "R&D;;", "aff_unique_url": "https://www.huawei.com/uk;https://www.tu-darmstadt.de;https://www.ucl.ac.uk", "aff_unique_abbr": "Huawei;TUD;UCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;0;0;0;0;0+0", "aff_country_unique": "United Kingdom;Germany" }, { "title": "Scalable Computation of Causal Bounds", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16297", "id": "16297", "proceeding": "https://proceedings.mlr.press/v162/shridharan22a.html", "poster": "/media/PosterPDFs/ICML%202022/92cf3f7ef90630755b955924254e6ec4.png?t=1657591598.8026853", "slides": "", "author_site": "Madhumitha Shridharan, Garud Iyengar", "author": "Madhumitha Shridharan; Garud Iyengar", "abstract": "We consider the problem of computing bounds for causal inference problems with unobserved confounders, where identifiability does not hold. Existing non-parametric approaches for computing such bounds use linear programming (LP) formulations that quickly become intractable for existing solvers because the size of the LP grows exponentially in the number of edges in the underlying causal graph. We show that this LP can be significantly pruned by carefully considering the structure of the causal query, allowing us to compute bounds for significantly larger causal inference problems as compared to what is possible using existing techniques. This pruning procedure also allows us to compute the bounds in closed form for a special class of causal graphs and queries, which includes a well-studied family of problems where multiple confounded treatments influence an outcome. 
We also propose a very efficient greedy heuristic that produces very high quality bounds, and scales to problems that are several orders of magnitude larger than those for which the pruned LP can be solved.", "bibtex": "@InProceedings{pmlr-v162-shridharan22a,\n title = \t {Scalable Computation of Causal Bounds},\n author = {Shridharan, Madhumitha and Iyengar, Garud},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20125--20140},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shridharan22a/shridharan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shridharan22a.html},\n abstract = \t {We consider the problem of computing bounds for causal inference problems with unobserved confounders, where identifiability does not hold. Existing non-parametric approaches for computing such bounds use linear programming (LP) formulations that quickly become intractable for existing solvers because the size of the LP grows exponentially in the number of edges in the underlying causal graph. We show that this LP can be significantly pruned by carefully considering the structure of the causal query, allowing us to compute bounds for significantly larger causal inference problems as compared to what is possible using existing techniques. This pruning procedure also allows us to compute the bounds in closed form for a special class of causal graphs and queries, which includes a well-studied family of problems where multiple confounded treatments influence an outcome. 
We also propose a very efficient greedy heuristic that produces very high quality bounds, and scales to problems that are several orders of magnitude larger than those for which the pruned LP can be solved.}\n}", "pdf": "https://proceedings.mlr.press/v162/shridharan22a/shridharan22a.pdf", "supp": "", "pdf_size": 457994, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8640439242503228154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Industrial Engineering and Operations Research, Columbia University, New York, USA; Department of Industrial Engineering and Operations Research, Columbia University, New York, USA", "aff_domain": "columbia.edu;ieor.columbia.edu", "email": "columbia.edu;ieor.columbia.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/shridharan22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Industrial Engineering and Operations Research", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Scalable Deep Gaussian Markov Random Fields for General Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17281", "id": "17281", "proceeding": "https://proceedings.mlr.press/v162/oskarsson22a.html", "poster": "/media/PosterPDFs/ICML%202022/4a47d2983c8bd392b120b627e0e1cab4.png?t=1657559798.1740797", "slides": "", "author_site": "Joel Oskarsson, Per Sid\u00e9n, Fredrik Lindsten", "author": "Joel Oskarsson; Per Sid\u00e9n; Fredrik Lindsten", "abstract": "Machine learning methods on graphs have proven useful in many applications due to their ability to handle generally structured data. The framework of Gaussian Markov Random Fields (GMRFs) provides a principled way to define Gaussian models on graphs by utilizing their sparsity structure. We propose a flexible GMRF model for general graphs built on the multi-layer structure of Deep GMRFs, originally proposed for lattice graphs only. By designing a new type of layer we enable the model to scale to large graphs. The layer is constructed to allow for efficient training using variational inference and existing software frameworks for Graph Neural Networks. For a Gaussian likelihood, close to exact Bayesian inference is available for the latent field. This allows for making predictions with accompanying uncertainty estimates. 
The usefulness of the proposed model is verified by experiments on a number of synthetic and real world datasets, where it compares favorably to other both Bayesian and deep learning methods.", "bibtex": "@InProceedings{pmlr-v162-oskarsson22a,\n title = \t {Scalable Deep {G}aussian {M}arkov Random Fields for General Graphs},\n author = {Oskarsson, Joel and Sid{\\'e}n, Per and Lindsten, Fredrik},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17117--17137},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/oskarsson22a/oskarsson22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/oskarsson22a.html},\n abstract = \t {Machine learning methods on graphs have proven useful in many applications due to their ability to handle generally structured data. The framework of Gaussian Markov Random Fields (GMRFs) provides a principled way to define Gaussian models on graphs by utilizing their sparsity structure. We propose a flexible GMRF model for general graphs built on the multi-layer structure of Deep GMRFs, originally proposed for lattice graphs only. By designing a new type of layer we enable the model to scale to large graphs. The layer is constructed to allow for efficient training using variational inference and existing software frameworks for Graph Neural Networks. For a Gaussian likelihood, close to exact Bayesian inference is available for the latent field. This allows for making predictions with accompanying uncertainty estimates. 
The usefulness of the proposed model is verified by experiments on a number of synthetic and real world datasets, where it compares favorably to other both Bayesian and deep learning methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/oskarsson22a/oskarsson22a.pdf", "supp": "", "pdf_size": 2501773, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16619238478793238405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Division of Statistics and Machine Learning, Department of Computer and Information Science, Link\u00f6ping University, Link\u00f6ping, Sweden; Division of Statistics and Machine Learning, Department of Computer and Information Science, Link\u00f6ping University, Link\u00f6ping, Sweden + Arriver Software AB; Division of Statistics and Machine Learning, Department of Computer and Information Science, Link\u00f6ping University, Link\u00f6ping, Sweden", "aff_domain": "liu.se; ; ", "email": "liu.se; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/oskarsson22a.html", "aff_unique_index": "0;0+1;0", "aff_unique_norm": "Link\u00f6ping University;Arriver Software", "aff_unique_dep": "Department of Computer and Information Science;", "aff_unique_url": "https://www.liu.se;https://www.arriver.com", "aff_unique_abbr": "LiU;Arriver", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Link\u00f6ping;", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "Sweden" }, { "title": "Scalable Deep Reinforcement Learning Algorithms for Mean Field Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17457", "id": "17457", "proceeding": "https://proceedings.mlr.press/v162/lauriere22a.html", "poster": "/media/PosterPDFs/ICML%202022/3016a447172f3045b65f5fc83e04b554.png?t=1658180054.2021377", "slides": "", "author_site": "Mathieu Lauriere, Sarah Perrin, Sertan Girgin, Paul Muller, Ayush Jain, Theophile Cabannes, Georgios Piliouras, Julien Perolat, Romuald Elie, Olivier Pietquin, Matthieu Geist", "author": "Mathieu Lauriere; Sarah Perrin; Sertan Girgin; Paul Muller; Ayush Jain; Theophile Cabannes; Georgios Piliouras; Julien Perolat; Romuald Elie; Olivier Pietquin; Matthieu Geist", "abstract": "Mean Field Games (MFGs) have been introduced to efficiently approximate games with very large populations of strategic agents. Recently, the question of learning equilibria in MFGs has gained momentum, particularly using model-free reinforcement learning (RL) methods. One limiting factor to further scale up using RL is that existing algorithms to solve MFGs require the mixing of approximated quantities such as strategies or $q$-values. 
This is far from being trivial in the case of non-linear function approximation that enjoy good generalization properties,", "bibtex": "@InProceedings{pmlr-v162-lauriere22a,\n title = \t {Scalable Deep Reinforcement Learning Algorithms for Mean Field Games},\n author = {Lauriere, Mathieu and Perrin, Sarah and Girgin, Sertan and Muller, Paul and Jain, Ayush and Cabannes, Theophile and Piliouras, Georgios and Perolat, Julien and Elie, Romuald and Pietquin, Olivier and Geist, Matthieu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12078--12095},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lauriere22a/lauriere22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lauriere22a.html},\n abstract = \t {Mean Field Games (MFGs) have been introduced to efficiently approximate games with very large populations of strategic agents. Recently, the question of learning equilibria in MFGs has gained momentum, particularly using model-free reinforcement learning (RL) methods. One limiting factor to further scale up using RL is that existing algorithms to solve MFGs require the mixing of approximated quantities such as strategies or $q$-values. This is far from being trivial in the case of non-linear function approximation that enjoy good generalization properties,", "pdf": "https://proceedings.mlr.press/v162/lauriere22a/lauriere22a.pdf", "supp": "", "pdf_size": 782376, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6774586953969569962&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "NYU Shanghai, China; Univ. Lille, CNRS, Inria, Centrale Lille, UMR 9189 CRIStAL, France; Google Research; DeepMind; UC Berkeley, California, USA; Singapore University of Technology and Design, Singapore; DeepMind; DeepMind; DeepMind; Google Research; Google Research", "aff_domain": "nyu.edu; ; ; ; ; ; ; ; ; ; ", "email": "nyu.edu; ; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/lauriere22a.html", "aff_unique_index": "0;1;2;3;4;5;3;3;3;2;2", "aff_unique_norm": "New York University Shanghai;University of Lille;Google;DeepMind;University of California, Berkeley;Singapore University of Technology and Design", "aff_unique_dep": ";UMR 9189 CRIStAL;Google Research;;;", "aff_unique_url": "https://shanghai.nyu.edu;https://www.univ-lille.fr;https://research.google;https://deepmind.com;https://www.berkeley.edu;https://www.sutd.edu.sg", "aff_unique_abbr": "NYU Shanghai;Univ. 
Lille;Google Research;DeepMind;UC Berkeley;SUTD", "aff_campus_unique_index": "0;2;3;2;2", "aff_campus_unique": "Shanghai;;Mountain View;Berkeley", "aff_country_unique_index": "0;1;2;3;2;4;3;3;3;2;2", "aff_country_unique": "China;France;United States;United Kingdom;Singapore" }, { "title": "Scalable First-Order Bayesian Optimization via Structured Automatic Differentiation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18413", "id": "18413", "proceeding": "https://proceedings.mlr.press/v162/ament22a.html", "poster": "/media/PosterPDFs/ICML%202022/2ea279ca696946aceb4337fb1ba9b23a.png?t=1658158961.2536683", "slides": "", "author_site": "Sebastian Ament, Carla Gomes", "author": "Sebastian E Ament; Carla P Gomes", "abstract": "Bayesian Optimization (BO) has shown great promise for the global optimization of functions that are expensive to evaluate, but despite many successes, standard approaches can struggle in high dimensions. To improve the performance of BO, prior work suggested incorporating gradient information into a Gaussian process surrogate of the objective, giving rise to kernel matrices of size $nd$ {\\texttimes} $nd$ for $n$ observations in $d$ dimensions. Na\u0131\u0308vely multiplying with (resp. inverting) these matrices requires $O(n^2d^2)$ (resp. $O(n^3d^3)$) operations, which becomes infeasible for moderate dimensions and sample sizes. Here, we observe that a wide range of kernels gives rise to structured matrices, enabling an exact $O(n^2d)$ matrix-vector multiply for gradient observations and $O(n^2d^2)$ for Hessian observations. Beyond canonical kernel classes, we derive a programmatic approach to leveraging this type of structure for transformations and combinations of the discussed kernel classes, which constitutes a structure-aware automatic differentiation algorithm. Our methods apply to virtually all canonical kernels and automatically extend to complex kernels, like the neural network, radial basis function network, and spectral mixture kernels without any additional derivations, enabling flexible, problem-dependent modeling while scaling first-order BO to high $d$.", "bibtex": "@InProceedings{pmlr-v162-ament22a,\n title = \t {Scalable First-Order {B}ayesian Optimization via Structured Automatic Differentiation},\n author = {Ament, Sebastian E and Gomes, Carla P},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {500--516},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ament22a/ament22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ament22a.html},\n abstract = \t {Bayesian Optimization (BO) has shown great promise for the global optimization of functions that are expensive to evaluate, but despite many successes, standard approaches can struggle in high dimensions. To improve the performance of BO, prior work suggested incorporating gradient information into a Gaussian process surrogate of the objective, giving rise to kernel matrices of size $nd$ {\\texttimes} $nd$ for $n$ observations in $d$ dimensions. Na\u0131\u0308vely multiplying with (resp. inverting) these matrices requires $O(n^2d^2)$ (resp. $O(n^3d^3)$) operations, which becomes infeasible for moderate dimensions and sample sizes. 
Here, we observe that a wide range of kernels gives rise to structured matrices, enabling an exact $O(n^2d)$ matrix-vector multiply for gradient observations and $O(n^2d^2)$ for Hessian observations. Beyond canonical kernel classes, we derive a programmatic approach to leveraging this type of structure for transformations and combinations of the discussed kernel classes, which constitutes a structure-aware automatic differentiation algorithm. Our methods apply to virtually all canonical kernels and automatically extend to complex kernels, like the neural network, radial basis function network, and spectral mixture kernels without any additional derivations, enabling flexible, problem-dependent modeling while scaling first-order BO to high $d$.}\n}", "pdf": "https://proceedings.mlr.press/v162/ament22a/ament22a.pdf", "supp": "", "pdf_size": 4339043, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17864781963029193260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Cornell University, Ithaca, NY, 14850, USA; Department of Computer Science, Cornell University, Ithaca, NY, 14850, USA", "aff_domain": "cs.cornell.edu; ", "email": "cs.cornell.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ament22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ithaca", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Scalable MCMC Sampling for Nonsymmetric Determinantal Point Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17671", "id": "17671", "proceeding": "https://proceedings.mlr.press/v162/han22b.html", "poster": "/media/PosterPDFs/ICML%202022/861637a425ef06e6d539aaaff113d1d5.png?t=1657728568.4597423", "slides": "", "author_site": "Insu Han, Mike Gartrell, Elvis Dohmatob, Amin Karbasi", "author": "Insu Han; Mike Gartrell; Elvis Dohmatob; Amin Karbasi", "abstract": "A determinantal point process (DPP) is an elegant model that assigns a probability to every subset of a collection of $n$ items. While conventionally a DPP is parameterized by a symmetric kernel matrix, removing this symmetry constraint, resulting in nonsymmetric DPPs (NDPPs), leads to significant improvements in modeling power and predictive performance. Recent work has studied an approximate Markov chain Monte Carlo (MCMC) sampling algorithm for NDPPs restricted to size-$k$ subsets (called $k$-NDPPs). However, the runtime of this approach is quadratic in $n$, making it infeasible for large-scale settings. In this work, we develop a scalable MCMC sampling algorithm for $k$-NDPPs with low-rank kernels, thus enabling runtime that is sublinear in $n$. Our method is based on a state-of-the-art NDPP rejection sampling algorithm, which we enhance with a novel approach for efficiently constructing the proposal distribution. Furthermore, we extend our scalable $k$-NDPP sampling algorithm to NDPPs without size constraints. Our resulting sampling method has polynomial time complexity in the rank of the kernel, while the existing approach has runtime that is exponential in the rank. 
With both a theoretical analysis and experiments on real-world datasets, we verify that our scalable approximate sampling algorithms are orders of magnitude faster than existing sampling approaches for $k$-NDPPs and NDPPs.", "bibtex": "@InProceedings{pmlr-v162-han22b,\n title = \t {Scalable {MCMC} Sampling for Nonsymmetric Determinantal Point Processes},\n author = {Han, Insu and Gartrell, Mike and Dohmatob, Elvis and Karbasi, Amin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8213--8229},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22b/han22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22b.html},\n abstract = \t {A determinantal point process (DPP) is an elegant model that assigns a probability to every subset of a collection of $n$ items. While conventionally a DPP is parameterized by a symmetric kernel matrix, removing this symmetry constraint, resulting in nonsymmetric DPPs (NDPPs), leads to significant improvements in modeling power and predictive performance. Recent work has studied an approximate Markov chain Monte Carlo (MCMC) sampling algorithm for NDPPs restricted to size-$k$ subsets (called $k$-NDPPs). However, the runtime of this approach is quadratic in $n$, making it infeasible for large-scale settings. In this work, we develop a scalable MCMC sampling algorithm for $k$-NDPPs with low-rank kernels, thus enabling runtime that is sublinear in $n$. Our method is based on a state-of-the-art NDPP rejection sampling algorithm, which we enhance with a novel approach for efficiently constructing the proposal distribution. Furthermore, we extend our scalable $k$-NDPP sampling algorithm to NDPPs without size constraints. Our resulting sampling method has polynomial time complexity in the rank of the kernel, while the existing approach has runtime that is exponential in the rank. 
With both a theoretical analysis and experiments on real-world datasets, we verify that our scalable approximate sampling algorithms are orders of magnitude faster than existing sampling approaches for $k$-NDPPs and NDPPs.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22b/han22b.pdf", "supp": "", "pdf_size": 608628, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=280717695600419200&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Yale University; Criteo AI Lab, Paris, France; Facebook AI Lab, Paris, France; Yale University", "aff_domain": "yale.edu;criteo.com; ;yale.edu", "email": "yale.edu;criteo.com; ;yale.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/han22b.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Yale University;Criteo;Meta", "aff_unique_dep": ";Criteo AI Lab;Facebook AI Lab", "aff_unique_url": "https://www.yale.edu;https://www.criteo.com;https://ai.facebook.com", "aff_unique_abbr": "Yale;Criteo;Facebook AI", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;France" }, { "title": "Scalable Spike-and-Slab", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17225", "id": "17225", "proceeding": "https://proceedings.mlr.press/v162/biswas22a.html", "poster": "", "slides": "", "author_site": "Niloy Biswas, Lester Mackey, Xiao-Li Meng", "author": "Niloy Biswas; Lester Mackey; Xiao-Li Meng", "abstract": "Spike-and-slab priors are commonly used for Bayesian variable selection, due to their interpretability and favorable statistical properties. However, existing samplers for spike-and-slab posteriors incur prohibitive computational costs when the number of variables is large. In this article, we propose Scalable Spike-and-Slab (S^3), a scalable Gibbs sampling implementation for high-dimensional Bayesian regression with the continuous spike-and-slab prior of George & McCulloch (1993). For a dataset with n observations and p covariates, S^3 has order max{n^2 p_t, np} computational cost at iteration t where p_t never exceeds the number of covariates switching spike-and-slab states between iterations t and t-1 of the Markov chain. This improves upon the order n^2 p per-iteration cost of state-of-the-art implementations as, typically, p_t is substantially smaller than p. We apply S^3 on synthetic and real-world datasets, demonstrating orders of magnitude speed-ups over existing exact samplers and significant gains in inferential quality over approximate samplers with comparable cost.", "bibtex": "@InProceedings{pmlr-v162-biswas22a,\n title = \t {Scalable Spike-and-Slab},\n author = {Biswas, Niloy and Mackey, Lester and Meng, Xiao-Li},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2021--2040},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/biswas22a/biswas22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/biswas22a.html},\n abstract = \t {Spike-and-slab priors are commonly used for Bayesian variable selection, due to their interpretability and favorable statistical properties. 
However, existing samplers for spike-and-slab posteriors incur prohibitive computational costs when the number of variables is large. In this article, we propose Scalable Spike-and-Slab (S^3), a scalable Gibbs sampling implementation for high-dimensional Bayesian regression with the continuous spike-and-slab prior of George & McCulloch (1993). For a dataset with n observations and p covariates, S^3 has order max{n^2 p_t, np} computational cost at iteration t where p_t never exceeds the number of covariates switching spike-and-slab states between iterations t and t-1 of the Markov chain. This improves upon the order n^2 p per-iteration cost of state-of-the-art implementations as, typically, p_t is substantially smaller than p. We apply S^3 on synthetic and real-world datasets, demonstrating orders of magnitude speed-ups over existing exact samplers and significant gains in inferential quality over approximate samplers with comparable cost.}\n}", "pdf": "https://proceedings.mlr.press/v162/biswas22a/biswas22a.pdf", "supp": "", "pdf_size": 3558813, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14911861538168658848&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, Harvard University; Microsoft Research New England; Department of Statistics, Harvard University", "aff_domain": "g.harvard.edu; ; ", "email": "g.harvard.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/biswas22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Harvard University;Microsoft", "aff_unique_dep": "Department of Statistics;Microsoft Research", "aff_unique_url": "https://www.harvard.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-new-england", "aff_unique_abbr": "Harvard;MSR NE", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Cambridge;New England", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Gaussian Process Optimization by Evaluating a Few Unique Candidates Multiple Times", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16537", "id": "16537", "proceeding": "https://proceedings.mlr.press/v162/calandriello22a.html", "poster": "", "slides": "", "author_site": "Daniele Calandriello, Luigi Carratino, Alessandro Lazaric, Michal Valko, Lorenzo Rosasco", "author": "Daniele Calandriello; Luigi Carratino; Alessandro Lazaric; Michal Valko; Lorenzo Rosasco", "abstract": "Computing a Gaussian process (GP) posterior has a computational cost cubical in the number of historical points. 
A reformulation of the same GP posterior highlights that this complexity mainly depends on how many", "bibtex": "@InProceedings{pmlr-v162-calandriello22a,\n title = \t {Scaling {G}aussian Process Optimization by Evaluating a Few Unique Candidates Multiple Times},\n author = {Calandriello, Daniele and Carratino, Luigi and Lazaric, Alessandro and Valko, Michal and Rosasco, Lorenzo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2523--2541},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/calandriello22a/calandriello22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/calandriello22a.html},\n abstract = \t {Computing a Gaussian process (GP) posterior has a computational cost cubical in the number of historical points. A reformulation of the same GP posterior highlights that this complexity mainly depends on how many", "pdf": "https://proceedings.mlr.press/v162/calandriello22a/calandriello22a.pdf", "supp": "", "pdf_size": 720598, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6988962023318036499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DeepMind; MaLGa, DIBRIS, University of Genova, Italy; Meta AI; Massachusetts Institute of Technology; Istituto Italiano di Tecnologia", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/calandriello22a.html", "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "DeepMind;University of Genova;Meta;Massachusetts Institute of Technology;Istituto Italiano di Tecnologia", "aff_unique_dep": ";DIBRIS;Meta AI;;", "aff_unique_url": "https://deepmind.com;https://www.unige.it;https://meta.com;https://web.mit.edu;https://www.iit.it", "aff_unique_abbr": "DeepMind;UniGe;Meta;MIT;IIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;1", "aff_country_unique": "United Kingdom;Italy;United States" }, { "title": "Scaling Out-of-Distribution Detection for Real-World Settings", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16907", "id": "16907", "proceeding": "https://proceedings.mlr.press/v162/hendrycks22a.html", "poster": "/media/PosterPDFs/ICML%202022/cb463625fc9dde2d82207e15bde1b674.png?t=1658210450.4602427", "slides": "", "author_site": "Dan Hendrycks, Steven Basart, Mantas Mazeika, Andy Zou, joseph kwon, Mohammadreza Mostajabi, Jacob Steinhardt, Dawn Song", "author": "Dan Hendrycks; Steven Basart; Mantas Mazeika; Andy Zou; Joseph Kwon; Mohammadreza Mostajabi; Jacob Steinhardt; Dawn Song", "abstract": "Detecting out-of-distribution examples is important for safety-critical machine learning applications such as detecting novel biological phenomena and self-driving cars. However, existing research mainly focuses on simple small-scale settings. To set the stage for more realistic out-of-distribution detection, we depart from small-scale settings and explore large-scale multiclass and multi-label settings with high-resolution images and thousands of classes. To make future work in real-world settings possible, we create new benchmarks for three large-scale settings. 
To test ImageNet multiclass anomaly detectors, we introduce the Species dataset containing over 700,000 images and over a thousand anomalous species. We leverage ImageNet-21K to evaluate PASCAL VOC and COCO multilabel anomaly detectors. Third, we introduce a new benchmark for anomaly segmentation by introducing a segmentation benchmark with road anomalies. We conduct extensive experiments in these more realistic settings for out-of-distribution detection and find that a surprisingly simple detector based on the maximum logit outperforms prior methods in all the large-scale multi-class, multi-label, and segmentation tasks, establishing a simple new baseline for future work.", "bibtex": "@InProceedings{pmlr-v162-hendrycks22a,\n title = \t {Scaling Out-of-Distribution Detection for Real-World Settings},\n author = {Hendrycks, Dan and Basart, Steven and Mazeika, Mantas and Zou, Andy and Kwon, Joseph and Mostajabi, Mohammadreza and Steinhardt, Jacob and Song, Dawn},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8759--8773},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hendrycks22a/hendrycks22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hendrycks22a.html},\n abstract = \t {Detecting out-of-distribution examples is important for safety-critical machine learning applications such as detecting novel biological phenomena and self-driving cars. However, existing research mainly focuses on simple small-scale settings. To set the stage for more realistic out-of-distribution detection, we depart from small-scale settings and explore large-scale multiclass and multi-label settings with high-resolution images and thousands of classes. To make future work in real-world settings possible, we create new benchmarks for three large-scale settings. To test ImageNet multiclass anomaly detectors, we introduce the Species dataset containing over 700,000 images and over a thousand anomalous species. We leverage ImageNet-21K to evaluate PASCAL VOC and COCO multilabel anomaly detectors. Third, we introduce a new benchmark for anomaly segmentation by introducing a segmentation benchmark with road anomalies. 
We conduct extensive experiments in these more realistic settings for out-of-distribution detection and find that a surprisingly simple detector based on the maximum logit outperforms prior methods in all the large-scale multi-class, multi-label, and segmentation tasks, establishing a simple new baseline for future work.}\n}", "pdf": "https://proceedings.mlr.press/v162/hendrycks22a/hendrycks22a.pdf", "supp": "", "pdf_size": 12103331, "gs_citation": 571, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8919172731066658800&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "UC Berkeley; UChicago; UIUC; UC Berkeley; Yale University; TTIC; UC Berkeley; UC Berkeley", "aff_domain": "berkeley.edu; ; ; ; ; ; ;berkeley.edu", "email": "berkeley.edu; ; ; ; ; ; ;berkeley.edu", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/hendrycks22a.html", "aff_unique_index": "0;1;2;0;3;4;0;0", "aff_unique_norm": "University of California, Berkeley;University of Chicago;University of Illinois Urbana-Champaign;Yale University;Toyota Technological Institute at Chicago", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.berkeley.edu;https://www.uchicago.edu;https://www.illinois.edu;https://www.yale.edu;https://www.ttic.edu", "aff_unique_abbr": "UC Berkeley;UChicago;UIUC;Yale;TTIC", "aff_campus_unique_index": "0;2;0;3;0;0", "aff_campus_unique": "Berkeley;;Urbana-Champaign;Chicago", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Scaling Structured Inference with Randomization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16417", "id": "16417", "proceeding": "https://proceedings.mlr.press/v162/fu22a.html", "poster": "/media/PosterPDFs/ICML%202022/4da04049a062f5adfe81b67dd755cecc.png?t=1657842613.696524", "slides": "", "author_site": "Yao Fu, John Cunningham, Mirella Lapata", "author": "Yao Fu; John Cunningham; Mirella Lapata", "abstract": "Deep discrete structured models have seen considerable progress recently, but traditional inference using dynamic programming (DP) typically works with a small number of states (less than hundreds), which severely limits model capacity. At the same time, across machine learning, there is a recent trend of using randomized truncation techniques to accelerate computations involving large sums. Here, we propose a family of randomized dynamic programming (RDP) algorithms for scaling structured models to tens of thousands of latent states. Our method is widely applicable to classical DP-based inference (partition, marginal, reparameterization, entropy) and different graph structures (chains, trees, and more general hypergraphs). It is also compatible with automatic differentiation: it can be integrated with neural networks seamlessly and learned with gradient-based optimizers. Our core technique approximates the sum-product by restricting and reweighting DP on a small subset of nodes, which reduces computation by orders of magnitude. We further achieve low bias and variance via Rao-Blackwellization and importance sampling. Experiments over different graphs demonstrate the accuracy and efficiency of our approach. 
Furthermore, when using RDP for training a structured variational autoencoder with a scaled inference network, we achieve better test likelihood than baselines and successfully prevent posterior collapse.", "bibtex": "@InProceedings{pmlr-v162-fu22a,\n title = \t {Scaling Structured Inference with Randomization},\n author = {Fu, Yao and Cunningham, John and Lapata, Mirella},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6811--6828},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fu22a/fu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fu22a.html},\n abstract = \t {Deep discrete structured models have seen considerable progress recently, but traditional inference using dynamic programming (DP) typically works with a small number of states (less than hundreds), which severely limits model capacity. At the same time, across machine learning, there is a recent trend of using randomized truncation techniques to accelerate computations involving large sums. Here, we propose a family of randomized dynamic programming (RDP) algorithms for scaling structured models to tens of thousands of latent states. Our method is widely applicable to classical DP-based inference (partition, marginal, reparameterization, entropy) and different graph structures (chains, trees, and more general hypergraphs). It is also compatible with automatic differentiation: it can be integrated with neural networks seamlessly and learned with gradient-based optimizers. Our core technique approximates the sum-product by restricting and reweighting DP on a small subset of nodes, which reduces computation by orders of magnitude. We further achieve low bias and variance via Rao-Blackwellization and importance sampling. Experiments over different graphs demonstrate the accuracy and efficiency of our approach. 
Furthermore, when using RDP for training a structured variational autoencoder with a scaled inference network, we achieve better test likelihood than baselines and successfully prevent posterior collapse.}\n}", "pdf": "https://proceedings.mlr.press/v162/fu22a/fu22a.pdf", "supp": "", "pdf_size": 1348821, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13234676438098295868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Informatics, University of Edinburgh; Statistics Department, Columbia University + Zuckerman Institute, Columbia University; School of Informatics, University of Edinburgh", "aff_domain": "ed.ac.uk;columbia.edu;inf.ed.ac.uk", "email": "ed.ac.uk;columbia.edu;inf.ed.ac.uk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/fu22a.html", "aff_unique_index": "0;1+1;0", "aff_unique_norm": "University of Edinburgh;Columbia University", "aff_unique_dep": "School of Informatics;Statistics Department", "aff_unique_url": "https://www.ed.ac.uk;https://www.columbia.edu", "aff_unique_abbr": "Edinburgh;Columbia", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Edinburgh;", "aff_country_unique_index": "0;1+1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Scaling-up Diverse Orthogonal Convolutional Networks by a Paraunitary Framework", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16575", "id": "16575", "proceeding": "https://proceedings.mlr.press/v162/su22a.html", "poster": "/media/PosterPDFs/ICML%202022/dab10c50dc668cd8560df444ff3a4227.png?t=1658009289.228991", "slides": "", "author_site": "Jiahao Su, Wonmin Byeon, Furong Huang", "author": "Jiahao Su; Wonmin Byeon; Furong Huang", "abstract": "Enforcing orthogonality in convolutional neural networks is a remedy for gradient vanishing/exploding problems and sensitivity to perturbation. Many previous approaches for orthogonal convolutions enforce orthogonality on its flattened kernel, which, however, do not lead to the orthogonality of the operation. Some recent approaches consider orthogonality for standard convolutional layers and propose specific classes of their realizations. In this work, we propose a theoretical framework that establishes the equivalence between diverse orthogonal convolutional layers in the spatial domain and the paraunitary systems in the spectral domain. Since 1D paraunitary systems admit a complete factorization, we can parameterize any separable orthogonal convolution as a composition of spatial filters. As a result, our framework endows high expressive power to various convolutional layers while maintaining their exact orthogonality. Furthermore, our layers are memory and computationally efficient for deep networks compared to previous designs. Our versatile framework, for the first time, enables the study of architectural designs for deep orthogonal networks, such as choices of skip connection, initialization, stride, and dilation. Consequently, we scale up orthogonal networks to deep architectures, including ResNet and ShuffleNet, substantially outperforming their shallower counterparts. Finally, we show how to construct residual flows, a flow-based generative model that requires strict Lipschitzness, using our orthogonal networks. 
Our code will be publicly available at https://github.com/umd-huang-lab/ortho-conv", "bibtex": "@InProceedings{pmlr-v162-su22a,\n title = \t {Scaling-up Diverse Orthogonal Convolutional Networks by a Paraunitary Framework},\n author = {Su, Jiahao and Byeon, Wonmin and Huang, Furong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20546--20579},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/su22a/su22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/su22a.html},\n abstract = \t {Enforcing orthogonality in convolutional neural networks is a remedy for gradient vanishing/exploding problems and sensitivity to perturbation. Many previous approaches for orthogonal convolutions enforce orthogonality on its flattened kernel, which, however, do not lead to the orthogonality of the operation. Some recent approaches consider orthogonality for standard convolutional layers and propose specific classes of their realizations. In this work, we propose a theoretical framework that establishes the equivalence between diverse orthogonal convolutional layers in the spatial domain and the paraunitary systems in the spectral domain. Since 1D paraunitary systems admit a complete factorization, we can parameterize any separable orthogonal convolution as a composition of spatial filters. As a result, our framework endows high expressive power to various convolutional layers while maintaining their exact orthogonality. Furthermore, our layers are memory and computationally efficient for deep networks compared to previous designs. Our versatile framework, for the first time, enables the study of architectural designs for deep orthogonal networks, such as choices of skip connection, initialization, stride, and dilation. Consequently, we scale up orthogonal networks to deep architectures, including ResNet and ShuffleNet, substantially outperforming their shallower counterparts. Finally, we show how to construct residual flows, a flow-based generative model that requires strict Lipschitzness, using our orthogonal networks. 
Our code will be publicly available at https://github.com/umd-huang-lab/ortho-conv}\n}", "pdf": "https://proceedings.mlr.press/v162/su22a/su22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/su22a-supp.zip", "pdf_size": 963234, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3882925949659737411&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Maryland, College Park, MD USA; NVIDIA Research, NVIDIA Corporation, Santa Clara, CA USA; University of Maryland, College Park, MD USA", "aff_domain": "umd.edu; ;umd.edu", "email": "umd.edu; ;umd.edu", "github": "https://github.com/umd-huang-lab/ortho-conv", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/su22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Maryland;NVIDIA", "aff_unique_dep": ";NVIDIA Research", "aff_unique_url": "https://www.umd.edu;https://www.nvidia.com", "aff_unique_abbr": "UMD;NVIDIA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "College Park;Santa Clara", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Score Matching Enables Causal Discovery of Nonlinear Additive Noise Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16279", "id": "16279", "proceeding": "https://proceedings.mlr.press/v162/rolland22a.html", "poster": "/media/PosterPDFs/ICML%202022/4f714c73db5191f3a71a380cba8843ed.png?t=1657875975.6354542", "slides": "", "author_site": "Paul Rolland, Volkan Cevher, Matth\u00e4us Kleindessner, Chris Russell, Dominik Janzing, Bernhard Sch\u00f6lkopf, Francesco Locatello", "author": "Paul Rolland; Volkan Cevher; Matth\u00e4us Kleindessner; Chris Russell; Dominik Janzing; Bernhard Sch\u00f6lkopf; Francesco Locatello", "abstract": "This paper demonstrates how to recover causal graphs from the score of the data distribution in non-linear additive (Gaussian) noise models. Using score matching algorithms as a building block, we show how to design a new generation of scalable causal discovery methods. To showcase our approach, we also propose a new efficient method for approximating the score\u2019s Jacobian, enabling to recover the causal graph. Empirically, we find that the new algorithm, called SCORE, is competitive with state-of-the-art causal discovery methods while being significantly faster.", "bibtex": "@InProceedings{pmlr-v162-rolland22a,\n title = \t {Score Matching Enables Causal Discovery of Nonlinear Additive Noise Models},\n author = {Rolland, Paul and Cevher, Volkan and Kleindessner, Matth{\\\"a}us and Russell, Chris and Janzing, Dominik and Sch{\\\"o}lkopf, Bernhard and Locatello, Francesco},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18741--18753},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rolland22a/rolland22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rolland22a.html},\n abstract = \t {This paper demonstrates how to recover causal graphs from the score of the data distribution in non-linear additive (Gaussian) noise models. Using score matching algorithms as a building block, we show how to design a new generation of scalable causal discovery methods. 
To showcase our approach, we also propose a new efficient method for approximating the score\u2019s Jacobian, enabling to recover the causal graph. Empirically, we find that the new algorithm, called SCORE, is competitive with state-of-the-art causal discovery methods while being significantly faster.}\n}", "pdf": "https://proceedings.mlr.press/v162/rolland22a/rolland22a.pdf", "supp": "", "pdf_size": 302531, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6048242509247224536&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "\u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne, Lausanne, Switzerland+Amazon, Tuebingen, Germany; \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne, Lausanne, Switzerland+Amazon, Tuebingen, Germany; Amazon, Tuebingen, Germany; Amazon, Tuebingen, Germany; Amazon, Tuebingen, Germany; Amazon, Tuebingen, Germany; Amazon, Tuebingen, Germany", "aff_domain": "epfl.ch; ; ; ; ; ;amazon.com", "email": "epfl.ch; ; ; ; ; ;amazon.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/rolland22a.html", "aff_unique_index": "0+1;0+1;1;1;1;1;1", "aff_unique_norm": "EPFL;Amazon", "aff_unique_dep": ";Amazon", "aff_unique_url": "https://www.epfl.ch;https://www.amazon.de", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "0+1;0+1;1;1;1;1;1", "aff_campus_unique": "Lausanne;Tuebingen", "aff_country_unique_index": "0+1;0+1;1;1;1;1;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "Score-Guided Intermediate Level Optimization: Fast Langevin Mixing for Inverse Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18401", "id": "18401", "proceeding": "https://proceedings.mlr.press/v162/daras22a.html", "poster": "", "slides": "", "author_site": "Giannis Daras, Yuval Dagan, Alexandros Dimakis, Constantinos Daskalakis", "author": "Giannis Daras; Yuval Dagan; Alex Dimakis; Constantinos Daskalakis", "abstract": "We prove fast mixing and characterize the stationary distribution of the Langevin Algorithm for inverting random weighted DNN generators. This result extends the work of Hand and Voroninski from efficient inversion to efficient posterior sampling. In practice, to allow for increased expressivity, we propose to do posterior sampling in the latent space of a pre-trained generative model. To achieve that, we train a score-based model in the latent space of a StyleGAN-2 and we use it to solve inverse problems. Our framework, Score-Guided Intermediate Layer Optimization (SGILO), extends prior work by replacing the sparsity regularization with a generative prior in the intermediate layer. 
Experimentally, we obtain significant improvements over the previous state-of-the-art, especially in the low measurement regime.", "bibtex": "@InProceedings{pmlr-v162-daras22a,\n title = \t {Score-Guided Intermediate Level Optimization: Fast {L}angevin Mixing for Inverse Problems},\n author = {Daras, Giannis and Dagan, Yuval and Dimakis, Alex and Daskalakis, Constantinos},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4722--4753},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/daras22a/daras22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/daras22a.html},\n abstract = \t {We prove fast mixing and characterize the stationary distribution of the Langevin Algorithm for inverting random weighted DNN generators. This result extends the work of Hand and Voroninski from efficient inversion to efficient posterior sampling. In practice, to allow for increased expressivity, we propose to do posterior sampling in the latent space of a pre-trained generative model. To achieve that, we train a score-based model in the latent space of a StyleGAN-2 and we use it to solve inverse problems. Our framework, Score-Guided Intermediate Layer Optimization (SGILO), extends prior work by replacing the sparsity regularization with a generative prior in the intermediate layer. Experimentally, we obtain significant improvements over the previous state-of-the-art, especially in the low measurement regime.}\n}", "pdf": "https://proceedings.mlr.press/v162/daras22a/daras22a.pdf", "supp": "", "pdf_size": 13713979, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2907628937367446227&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of Texas at Austin; MIT CSAIL; Department of Electrical and Computer Engineering, University of Texas at Austin; MIT CSAIL", "aff_domain": "utexas.edu;mit.edu;austin.utexas.edu;csail.mit.edu", "email": "utexas.edu;mit.edu;austin.utexas.edu;csail.mit.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/daras22a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Texas at Austin;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.utexas.edu;https://www.csail.mit.edu", "aff_unique_abbr": "UT Austin;MIT CSAIL", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Austin;Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Score-based Generative Modeling of Graphs via the System of Stochastic Differential Equations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18139", "id": "18139", "proceeding": "https://proceedings.mlr.press/v162/jo22a.html", "poster": "/media/PosterPDFs/ICML%202022/cf67355a3333e6e143439161adc2d82e_ZedDzrM.png?t=1657343841.3097463", "slides": "", "author_site": "Jaehyeong Jo, Seul Lee, Sung Ju Hwang", "author": "Jaehyeong Jo; Seul Lee; Sung Ju Hwang", "abstract": "Generating graph-structured data requires learning the underlying distribution of graphs. 
Yet, this is a challenging problem, and the previous graph generative methods either fail to capture the permutation-invariance property of graphs or cannot sufficiently model the complex dependency between nodes and edges, which is crucial for generating real-world graphs such as molecules. To overcome such limitations, we propose a novel score-based generative model for graphs with a continuous-time framework. Specifically, we propose a new graph diffusion process that models the joint distribution of the nodes and edges through a system of stochastic differential equations (SDEs). Then, we derive novel score matching objectives tailored for the proposed diffusion process to estimate the gradient of the joint log-density with respect to each component, and introduce a new solver for the system of SDEs to efficiently sample from the reverse diffusion process. We validate our graph generation method on diverse datasets, on which it either achieves significantly superior or competitive performance to the baselines. Further analysis shows that our method is able to generate molecules that lie close to the training distribution yet do not violate the chemical valency rule, demonstrating the effectiveness of the system of SDEs in modeling the node-edge relationships.", "bibtex": "@InProceedings{pmlr-v162-jo22a,\n title = \t {Score-based Generative Modeling of Graphs via the System of Stochastic Differential Equations},\n author = {Jo, Jaehyeong and Lee, Seul and Hwang, Sung Ju},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10362--10383},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jo22a/jo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jo22a.html},\n abstract = \t {Generating graph-structured data requires learning the underlying distribution of graphs. Yet, this is a challenging problem, and the previous graph generative methods either fail to capture the permutation-invariance property of graphs or cannot sufficiently model the complex dependency between nodes and edges, which is crucial for generating real-world graphs such as molecules. To overcome such limitations, we propose a novel score-based generative model for graphs with a continuous-time framework. Specifically, we propose a new graph diffusion process that models the joint distribution of the nodes and edges through a system of stochastic differential equations (SDEs). Then, we derive novel score matching objectives tailored for the proposed diffusion process to estimate the gradient of the joint log-density with respect to each component, and introduce a new solver for the system of SDEs to efficiently sample from the reverse diffusion process. We validate our graph generation method on diverse datasets, on which it either achieves significantly superior or competitive performance to the baselines. 
Further analysis shows that our method is able to generate molecules that lie close to the training distribution yet do not violate the chemical valency rule, demonstrating the effectiveness of the system of SDEs in modeling the node-edge relationships.}\n}", "pdf": "https://proceedings.mlr.press/v162/jo22a/jo22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jo22a-supp.zip", "pdf_size": 6248876, "gs_citation": 281, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4163972994004543532&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, South Korea; Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, South Korea; Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "github": "https://github.com/harryjo97/GDSS", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jo22a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;AITRICS", "aff_unique_dep": ";", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "South Korea" }, { "title": "Searching for BurgerFormer with Micro-Meso-Macro Space Design", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17979", "id": "17979", "proceeding": "https://proceedings.mlr.press/v162/yang22f.html", "poster": "/media/PosterPDFs/ICML%202022/00e26af6ac3b1c1c49d7c3d79c60d000.png?t=1657801702.0259974", "slides": "", "author_site": "Longxing Yang, Yu Hu, Shun Lu, Zihao Sun, Jilin Mei, Yinhe Han, Xiaowei Li", "author": "Longxing Yang; Yu Hu; Shun Lu; Zihao Sun; Jilin Mei; Yinhe Han; Xiaowei Li", "abstract": "With the success of Transformers in the computer vision field, the automated design of vision Transformers has attracted significant attention. Recently, MetaFormer found that simple average pooling can achieve impressive performance, which naturally raises the question of how to design a search space to search diverse and high-performance Transformer-like architectures. By revisiting typical search spaces, we design micro-meso-macro space to search for Transformer-like architectures, namely BurgerFormer. Micro, meso, and macro correspond to the granularity levels of operation, block and stage, respectively. At the microscopic level, we enrich the atomic operations to include various normalizations, activation functions, and basic operations (e.g., multi-head self attention, average pooling). At the mesoscopic level, a hamburger structure is searched out as the basic BurgerFormer block. At the macroscopic level, we search for the depth, width, and expansion ratio of the network based on the multi-stage architecture. Meanwhile, we propose a hybrid sampling method for effectively training the supernet. Experimental results demonstrate that the searched BurgerFormer architectures achieve comparable even superior performance compared with current state-of-the-art Transformers on the ImageNet and COCO datasets. 
The codes can be available at https://github.com/xingxing-123/BurgerFormer.", "bibtex": "@InProceedings{pmlr-v162-yang22f,\n title = \t {Searching for {B}urger{F}ormer with Micro-Meso-Macro Space Design},\n author = {Yang, Longxing and Hu, Yu and Lu, Shun and Sun, Zihao and Mei, Jilin and Han, Yinhe and Li, Xiaowei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25055--25069},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22f/yang22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22f.html},\n abstract = \t {With the success of Transformers in the computer vision field, the automated design of vision Transformers has attracted significant attention. Recently, MetaFormer found that simple average pooling can achieve impressive performance, which naturally raises the question of how to design a search space to search diverse and high-performance Transformer-like architectures. By revisiting typical search spaces, we design micro-meso-macro space to search for Transformer-like architectures, namely BurgerFormer. Micro, meso, and macro correspond to the granularity levels of operation, block and stage, respectively. At the microscopic level, we enrich the atomic operations to include various normalizations, activation functions, and basic operations (e.g., multi-head self attention, average pooling). At the mesoscopic level, a hamburger structure is searched out as the basic BurgerFormer block. At the macroscopic level, we search for the depth, width, and expansion ratio of the network based on the multi-stage architecture. Meanwhile, we propose a hybrid sampling method for effectively training the supernet. Experimental results demonstrate that the searched BurgerFormer architectures achieve comparable even superior performance compared with current state-of-the-art Transformers on the ImageNet and COCO datasets. The codes can be available at https://github.com/xingxing-123/BurgerFormer.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22f/yang22f.pdf", "supp": "", "pdf_size": 3435982, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12176664028124812702&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "https://github.com/xingxing-123/BurgerFormer", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/yang22f.html" }, { "title": "Secure Distributed Training at Scale", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17141", "id": "17141", "proceeding": "https://proceedings.mlr.press/v162/gorbunov22a.html", "poster": "/media/PosterPDFs/ICML%202022/65699726a3c601b9f31bf04019c8593c.png?t=1656054448.271129", "slides": "/media/icml-2022/Slides/17141.pdf", "author_site": "Eduard Gorbunov, Alexander Borzunov, Michael Diskin, Max Ryabinin", "author": "Eduard Gorbunov; Alexander Borzunov; Michael Diskin; Max Ryabinin", "abstract": "Many areas of deep learning benefit from using increasingly larger neural networks trained on public data, as is the case for pre-trained models for NLP and computer vision. 
Training such models requires a lot of computational resources (e.g., HPC clusters) that are not available to small research groups and independent researchers. One way to address it is for several smaller groups to pool their computational resources together and train a model that benefits all participants. Unfortunately, in this case, any participant can jeopardize the entire training run by sending incorrect updates, deliberately or by mistake. Training in presence of such peers requires specialized distributed training algorithms with Byzantine tolerance. These algorithms often sacrifice efficiency by introducing redundant communication or passing all updates through a trusted server, making it infeasible to apply them to large-scale deep learning, where models can have billions of parameters. In this work, we propose a novel protocol for secure (Byzantine-tolerant) decentralized training that emphasizes communication efficiency.", "bibtex": "@InProceedings{pmlr-v162-gorbunov22a,\n title = \t {Secure Distributed Training at Scale},\n author = {Gorbunov, Eduard and Borzunov, Alexander and Diskin, Michael and Ryabinin, Max},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7679--7739},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gorbunov22a/gorbunov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gorbunov22a.html},\n abstract = \t {Many areas of deep learning benefit from using increasingly larger neural networks trained on public data, as is the case for pre-trained models for NLP and computer vision. Training such models requires a lot of computational resources (e.g., HPC clusters) that are not available to small research groups and independent researchers. One way to address it is for several smaller groups to pool their computational resources together and train a model that benefits all participants. Unfortunately, in this case, any participant can jeopardize the entire training run by sending incorrect updates, deliberately or by mistake. Training in presence of such peers requires specialized distributed training algorithms with Byzantine tolerance. These algorithms often sacrifice efficiency by introducing redundant communication or passing all updates through a trusted server, making it infeasible to apply them to large-scale deep learning, where models can have billions of parameters. 
In this work, we propose a novel protocol for secure (Byzantine-tolerant) decentralized training that emphasizes communication efficiency.}\n}", "pdf": "https://proceedings.mlr.press/v162/gorbunov22a/gorbunov22a.pdf", "supp": "", "pdf_size": 1437115, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4900391981513067748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "MIPT+Mila \u2013 Quebec AI Institute+Yandex; HSE University+Yandex; HSE University+Yandex; HSE University+Yandex", "aff_domain": "phystech.edu;gmail.com; ; ", "email": "phystech.edu;gmail.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/gorbunov22a.html", "aff_unique_index": "0+1+2;3+2;3+2;3+2", "aff_unique_norm": "Moscow Institute of Physics and Technology;Quebec AI Institute;Yandex;Higher School of Economics", "aff_unique_dep": ";AI;;", "aff_unique_url": "https://mipt.ru;https://mila.quebec;https://yandex.com;https://hse.ru", "aff_unique_abbr": "MIPT;Mila;Yandex;HSE", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1+0;0+0;0+0;0+0", "aff_country_unique": "Russian Federation;Canada" }, { "title": "Secure Quantized Training for Deep Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17361", "id": "17361", "proceeding": "https://proceedings.mlr.press/v162/keller22a.html", "poster": "/media/PosterPDFs/ICML%202022/91bc333f6967019ac47b49ca0f2fa757.png?t=1656348659.122191", "slides": "/media/icml-2022/Slides/17361.pdf", "author_site": "Marcel Keller, Ke Sun", "author": "Marcel Keller; Ke Sun", "abstract": "We implement training of neural networks in secure multi-party computation (MPC) using quantization commonly used in said setting. We are the first to present an MNIST classifier purely trained in MPC that comes within 0.2 percent of the accuracy of the same convolutional neural network trained via plaintext computation. More concretely, we have trained a network with two convolutional and two dense layers to 99.2% accuracy in 3.5 hours (under one hour for 99% accuracy). We have also implemented AlexNet for CIFAR-10, which converges in a few hours. We develop novel protocols for exponentiation and inverse square root. Finally, we present experiments in a range of MPC security models for up to ten parties, both with honest and dishonest majority as well as semi-honest and malicious security.", "bibtex": "@InProceedings{pmlr-v162-keller22a,\n title = \t {Secure Quantized Training for Deep Learning},\n author = {Keller, Marcel and Sun, Ke},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10912--10938},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/keller22a/keller22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/keller22a.html},\n abstract = \t {We implement training of neural networks in secure multi-party computation (MPC) using quantization commonly used in said setting. We are the first to present an MNIST classifier purely trained in MPC that comes within 0.2 percent of the accuracy of the same convolutional neural network trained via plaintext computation. 
More concretely, we have trained a network with two convolutional and two dense layers to 99.2% accuracy in 3.5 hours (under one hour for 99% accuracy). We have also implemented AlexNet for CIFAR-10, which converges in a few hours. We develop novel protocols for exponentiation and inverse square root. Finally, we present experiments in a range of MPC security models for up to ten parties, both with honest and dishonest majority as well as semi-honest and malicious security.}\n}", "pdf": "https://proceedings.mlr.press/v162/keller22a/keller22a.pdf", "supp": "", "pdf_size": 496296, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15154157227965198183&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "CSIRO\u2019s Data61, Sydney, Australia+The Australian National University; CSIRO\u2019s Data61, Sydney, Australia+The Australian National University", "aff_domain": "data61.csiro.au;data61.csiro.au", "email": "data61.csiro.au;data61.csiro.au", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/keller22a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "CSIRO;Australian National University", "aff_unique_dep": "Data61;", "aff_unique_url": "https://www.csiro.au;https://www.anu.edu.au", "aff_unique_abbr": "CSIRO;ANU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Australia" }, { "title": "Selective Network Linearization for Efficient Private Inference", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16649", "id": "16649", "proceeding": "https://proceedings.mlr.press/v162/cho22a.html", "poster": "/media/PosterPDFs/ICML%202022/b17446af05919be6e83500be7f5df5c4.png?t=1657918898.4902544", "slides": "", "author_site": "Minsu Cho, Ameya Joshi, Brandon Reagen, Siddharth Garg, Chinmay Hegde", "author": "Minsu Cho; Ameya Joshi; Brandon Reagen; Siddharth Garg; Chinmay Hegde", "abstract": "Private inference (PI) enables inferences directly on cryptographically secure data. While promising to address many privacy issues, it has seen limited use due to extreme runtimes. Unlike plaintext inference, where latency is dominated by FLOPs, in PI non-linear functions (namely ReLU) are the bottleneck. Thus, practical PI demands novel ReLU-aware optimizations. To reduce PI latency we propose a gradient-based algorithm that selectively linearizes ReLUs while maintaining prediction accuracy. We evaluate our algorithm on several standard PI benchmarks. The results demonstrate up to $4.25%$ more accuracy (iso-ReLU count at 50K) or $2.2\\times$ less latency (iso-accuracy at 70%) than the current state of the art and advance the Pareto frontier across the latency-accuracy space. 
To complement empirical results, we present a \u201cno free lunch\" theorem that sheds light on how and when network linearization is possible while maintaining prediction accuracy.", "bibtex": "@InProceedings{pmlr-v162-cho22a,\n title = \t {Selective Network Linearization for Efficient Private Inference},\n author = {Cho, Minsu and Joshi, Ameya and Reagen, Brandon and Garg, Siddharth and Hegde, Chinmay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3947--3961},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cho22a/cho22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cho22a.html},\n abstract = \t {Private inference (PI) enables inferences directly on cryptographically secure data. While promising to address many privacy issues, it has seen limited use due to extreme runtimes. Unlike plaintext inference, where latency is dominated by FLOPs, in PI non-linear functions (namely ReLU) are the bottleneck. Thus, practical PI demands novel ReLU-aware optimizations. To reduce PI latency we propose a gradient-based algorithm that selectively linearizes ReLUs while maintaining prediction accuracy. We evaluate our algorithm on several standard PI benchmarks. The results demonstrate up to $4.25%$ more accuracy (iso-ReLU count at 50K) or $2.2\\times$ less latency (iso-accuracy at 70%) than the current state of the art and advance the Pareto frontier across the latency-accuracy space. To complement empirical results, we present a \u201cno free lunch\" theorem that sheds light on how and when network linearization is possible while maintaining prediction accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/cho22a/cho22a.pdf", "supp": "", "pdf_size": 1082520, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14016452576504224756&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "New York University Tandon School of Engineering; New York University Tandon School of Engineering; New York University Tandon School of Engineering; New York University Tandon School of Engineering; New York University Tandon School of Engineering", "aff_domain": "nyu.edu; ; ; ; ", "email": "nyu.edu; ; ; ; ", "github": "https://github.com/NYU-DICE-Lab/selective_network_linearization", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/cho22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://engineering.nyu.edu", "aff_unique_abbr": "NYU Tandon", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Tandon School of Engineering", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Selective Regression under Fairness Criteria", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18233", "id": "18233", "proceeding": "https://proceedings.mlr.press/v162/shah22a.html", "poster": "/media/PosterPDFs/ICML%202022/310614fca8fb8e5491295336298c340f_o8yeElI.png?t=1657820995.3558269", "slides": "", "author_site": "Abhin Shah, Yuheng Bu, Joshua Lee, Subhro Das, Rameswar Panda, Prasanna Sattigeri, Gregory Wornell", "author": "Abhin Shah; Yuheng Bu; Joshua K Lee; Subhro Das; Rameswar 
Panda; Prasanna Sattigeri; Gregory W Wornell", "abstract": "Selective regression allows abstention from prediction if the confidence to make an accurate prediction is not sufficient. In general, by allowing a reject option, one expects the performance of a regression model to increase at the cost of reducing coverage (i.e., by predicting on fewer samples). However, as we show, in some cases, the performance of a minority subgroup can decrease while we reduce the coverage, and thus selective regression can magnify disparities between different sensitive subgroups. Motivated by these disparities, we propose new fairness criteria for selective regression requiring the performance of every subgroup to improve with a decrease in coverage. We prove that if a feature representation satisfies the", "bibtex": "@InProceedings{pmlr-v162-shah22a,\n title = \t {Selective Regression under Fairness Criteria},\n author = {Shah, Abhin and Bu, Yuheng and Lee, Joshua K and Das, Subhro and Panda, Rameswar and Sattigeri, Prasanna and Wornell, Gregory W},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19598--19615},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shah22a/shah22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shah22a.html},\n abstract = \t {Selective regression allows abstention from prediction if the confidence to make an accurate prediction is not sufficient. In general, by allowing a reject option, one expects the performance of a regression model to increase at the cost of reducing coverage (i.e., by predicting on fewer samples). However, as we show, in some cases, the performance of a minority subgroup can decrease while we reduce the coverage, and thus selective regression can magnify disparities between different sensitive subgroups. Motivated by these disparities, we propose new fairness criteria for selective regression requiring the performance of every subgroup to improve with a decrease in coverage. 
We prove that if a feature representation satisfies the", "pdf": "https://proceedings.mlr.press/v162/shah22a/shah22a.pdf", "supp": "", "pdf_size": 553932, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11829060385063117064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Massachusetts Institute of Technology; Massachusetts Institute of Technology; Snap + Massachusetts Institute of Technology; MIT-IBM Watson AI Lab, IBM Research; MIT-IBM Watson AI Lab, IBM Research; MIT-IBM Watson AI Lab, IBM Research; Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu; ; ; ; ; ", "email": "mit.edu;mit.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/shah22a.html", "aff_unique_index": "0;0;1+0;2;2;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Snap Inc.;IBM", "aff_unique_dep": ";;AI Lab", "aff_unique_url": "https://web.mit.edu;https://www.snap.com;https://www.ibmwatsonai.org/", "aff_unique_abbr": "MIT;Snap;MIT-IBM AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Organized Polynomial-Time Coordination Graphs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17735", "id": "17735", "proceeding": "https://proceedings.mlr.press/v162/yang22a.html", "poster": "/media/PosterPDFs/ICML%202022/d87c68a56bc8eb803b44f25abb627786.png?t=1657718496.8594527", "slides": "", "author_site": "Qianlan Yang, Weijun Dong, Zhizhou Ren, Jianhao Wang, Tonghan Wang, Chongjie Zhang", "author": "Qianlan Yang; Weijun Dong; Zhizhou Ren; Jianhao Wang; Tonghan Wang; Chongjie Zhang", "abstract": "Coordination graph is a promising approach to model agent collaboration in multi-agent reinforcement learning. It conducts a graph-based value factorization and induces explicit coordination among agents to complete complicated tasks. However, one critical challenge in this paradigm is the complexity of greedy action selection with respect to the factorized values. It refers to the decentralized constraint optimization problem (DCOP), which and whose constant-ratio approximation are NP-hard problems. To bypass this systematic hardness, this paper proposes a novel method, named Self-Organized Polynomial-time Coordination Graphs (SOP-CG), which uses structured graph classes to guarantee the accuracy and the computational efficiency of collaborated action selection. SOP-CG employs dynamic graph topology to ensure sufficient value function expressiveness. The graph selection is unified into an end-to-end learning paradigm. 
In experiments, we show that our approach learns succinct and well-adapted graph topologies, induces effective coordination, and improves performance across a variety of cooperative multi-agent tasks.", "bibtex": "@InProceedings{pmlr-v162-yang22a,\n title = \t {Self-Organized Polynomial-Time Coordination Graphs},\n author = {Yang, Qianlan and Dong, Weijun and Ren, Zhizhou and Wang, Jianhao and Wang, Tonghan and Zhang, Chongjie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24963--24979},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22a/yang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22a.html},\n abstract = \t {Coordination graph is a promising approach to model agent collaboration in multi-agent reinforcement learning. It conducts a graph-based value factorization and induces explicit coordination among agents to complete complicated tasks. However, one critical challenge in this paradigm is the complexity of greedy action selection with respect to the factorized values. It refers to the decentralized constraint optimization problem (DCOP), which and whose constant-ratio approximation are NP-hard problems. To bypass this systematic hardness, this paper proposes a novel method, named Self-Organized Polynomial-time Coordination Graphs (SOP-CG), which uses structured graph classes to guarantee the accuracy and the computational efficiency of collaborated action selection. SOP-CG employs dynamic graph topology to ensure sufficient value function expressiveness. The graph selection is unified into an end-to-end learning paradigm. 
In experiments, we show that our approach learns succinct and well-adapted graph topologies, induces effective coordination, and improves performance across a variety of cooperative multi-agent tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22a/yang22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/yang22a-supp.zip", "pdf_size": 1594301, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10295867697115976866&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University; Department of Computer Science, University of Illinois at Urbana-Champaign; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University; Harvard University; Institute for Interdisciplinary Information Sciences (IIIS), Tsinghua University", "aff_domain": "tsinghua.edu.cn; ; ; ; ;tsinghua.edu.cn", "email": "tsinghua.edu.cn; ; ; ; ;tsinghua.edu.cn", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yang22a.html", "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Tsinghua University;University of Illinois Urbana-Champaign;Harvard University", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences (IIIS);Department of Computer Science;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://illinois.edu;https://www.harvard.edu", "aff_unique_abbr": "Tsinghua;UIUC;Harvard", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;1;0;1;0", "aff_country_unique": "China;United States" }, { "title": "Self-Supervised Models of Audio Effectively Explain Human Cortical Responses to Speech", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18093", "id": "18093", "proceeding": "https://proceedings.mlr.press/v162/vaidya22a.html", "poster": "/media/PosterPDFs/ICML%202022/b9a8f4af85454f7c56c06f0a39e7ec23.png?t=1658155819.2355218", "slides": "/media/icml-2022/Slides/18093.pdf", "author_site": "Aditya Vaidya, Shailee Jain, Alexander Huth", "author": "Aditya R Vaidya; Shailee Jain; Alexander Huth", "abstract": "Self-supervised language models are very effective at predicting high-level cortical responses during language comprehension. However, the best current models of lower-level auditory processing in the human brain rely on either hand-constructed acoustic filters or representations from supervised audio neural networks. In this work, we capitalize on the progress of self-supervised speech representation learning (SSL) to create new state-of-the-art models of the human auditory system. Compared against acoustic baselines, phonemic features, and supervised models, representations from the middle layers of self-supervised models (APC, wav2vec, wav2vec 2.0, and HuBERT) consistently yield the best prediction performance for fMRI recordings within the auditory cortex (AC). Brain areas involved in low-level auditory processing exhibit a preference for earlier SSL model layers, whereas higher-level semantic areas prefer later layers. We show that these trends are due to the models\u2019 ability to encode information at multiple linguistic levels (acoustic, phonetic, and lexical) along their representation depth. 
Overall, these results show that self-supervised models effectively capture the hierarchy of information relevant to different stages of speech processing in human cortex.", "bibtex": "@InProceedings{pmlr-v162-vaidya22a,\n title = \t {Self-Supervised Models of Audio Effectively Explain Human Cortical Responses to Speech},\n author = {Vaidya, Aditya R and Jain, Shailee and Huth, Alexander},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21927--21944},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vaidya22a/vaidya22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vaidya22a.html},\n abstract = \t {Self-supervised language models are very effective at predicting high-level cortical responses during language comprehension. However, the best current models of lower-level auditory processing in the human brain rely on either hand-constructed acoustic filters or representations from supervised audio neural networks. In this work, we capitalize on the progress of self-supervised speech representation learning (SSL) to create new state-of-the-art models of the human auditory system. Compared against acoustic baselines, phonemic features, and supervised models, representations from the middle layers of self-supervised models (APC, wav2vec, wav2vec 2.0, and HuBERT) consistently yield the best prediction performance for fMRI recordings within the auditory cortex (AC). Brain areas involved in low-level auditory processing exhibit a preference for earlier SSL model layers, whereas higher-level semantic areas prefer later layers. We show that these trends are due to the models\u2019 ability to encode information at multiple linguistic levels (acoustic, phonetic, and lexical) along their representation depth. 
Overall, these results show that self-supervised models effectively capture the hierarchy of information relevant to different stages of speech processing in human cortex.}\n}", "pdf": "https://proceedings.mlr.press/v162/vaidya22a/vaidya22a.pdf", "supp": "", "pdf_size": 9567287, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13307875906404938300&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science; Department of Computer Science; Department of Computer Science + Department of Neuroscience", "aff_domain": "utexas.edu; ;utexas.edu", "email": "utexas.edu; ;utexas.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/vaidya22a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Unknown Institution;Neuroscience Department", "aff_unique_dep": "Department of Computer Science;Department of Neuroscience", "aff_unique_url": ";", "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "title": "Self-Supervised Representation Learning via Latent Graph Prediction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15995", "id": "15995", "proceeding": "https://proceedings.mlr.press/v162/xie22e.html", "poster": "/media/PosterPDFs/ICML%202022/04ecb1fa28506ccb6f72b12c0245ddbc_gnKnKp3.png?t=1657942706.511528", "slides": "", "author_site": "Yaochen Xie, Zhao Xu, Shuiwang Ji", "author": "Yaochen Xie; Zhao Xu; Shuiwang Ji", "abstract": "Self-supervised learning (SSL) of graph neural networks is emerging as a promising way of leveraging unlabeled data. Currently, most methods are based on contrastive learning adapted from the image domain, which requires view generation and a sufficient number of negative samples. In contrast, existing predictive models do not require negative sampling, but lack theoretical guidance on the design of pretext training tasks. In this work, we propose the LaGraph, a theoretically grounded predictive SSL framework based on latent graph prediction. Learning objectives of LaGraph are derived as self-supervised upper bounds to objectives for predicting unobserved latent graphs. In addition to its improved performance, LaGraph provides explanations for recent successes of predictive models that include invariance-based objectives. We provide theoretical analysis comparing LaGraph to related methods in different domains. Our experimental results demonstrate the superiority of LaGraph in performance and the robustness to decreasing of training sample size on both graph-level and node-level tasks.", "bibtex": "@InProceedings{pmlr-v162-xie22e,\n title = \t {Self-Supervised Representation Learning via Latent Graph Prediction},\n author = {Xie, Yaochen and Xu, Zhao and Ji, Shuiwang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24460--24477},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xie22e/xie22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/xie22e.html},\n abstract = \t {Self-supervised learning (SSL) of graph neural networks is emerging as a promising way of leveraging unlabeled data. 
Currently, most methods are based on contrastive learning adapted from the image domain, which requires view generation and a sufficient number of negative samples. In contrast, existing predictive models do not require negative sampling, but lack theoretical guidance on the design of pretext training tasks. In this work, we propose the LaGraph, a theoretically grounded predictive SSL framework based on latent graph prediction. Learning objectives of LaGraph are derived as self-supervised upper bounds to objectives for predicting unobserved latent graphs. In addition to its improved performance, LaGraph provides explanations for recent successes of predictive models that include invariance-based objectives. We provide theoretical analysis comparing LaGraph to related methods in different domains. Our experimental results demonstrate the superiority of LaGraph in performance and the robustness to decreasing of training sample size on both graph-level and node-level tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/xie22e/xie22e.pdf", "supp": "", "pdf_size": 1868426, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15436923059083544697&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science & Engineering, Texas A&M University; Department of Computer Science & Engineering, Texas A&M University; Department of Computer Science & Engineering, Texas A&M University", "aff_domain": "tamu.edu; ;tamu.edu", "email": "tamu.edu; ;tamu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/xie22e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Computer Science & Engineering", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Self-conditioning Pre-Trained Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18083", "id": "18083", "proceeding": "https://proceedings.mlr.press/v162/cuadros22a.html", "poster": "/media/PosterPDFs/ICML%202022/e5ae7b1f180083e8a49e55e4d488bbec.png?t=1657287882.015127", "slides": "", "author_site": "Xavier Suau, Luca Zappella, Nicholas Apostoloff", "author": "Xavier Suau Cuadros; Luca Zappella; Nicholas Apostoloff", "abstract": "In this paper we aim to investigate the mechanisms that guide text generation with pre-trained Transformer-based Language Models (TLMs). Grounded on the Product of Experts formulation by Hinton (1999), we describe a generative mechanism that exploits expert units which naturally exist in TLMs. Such units are responsible for detecting concepts in the input and conditioning text generation on such concepts. We describe how to identify expert units and how to activate them during inference in order to induce any desired concept in the generated output. We find that the activation of a surprisingly small amount of units is sufficient to steer text generation (as little as 3 units in a model with 345M parameters). While the objective of this work is to learn more about how TLMs work, we show that our method is effective for conditioning without fine-tuning or using extra parameters, even on fine-grained homograph concepts. 
Additionally, we show that our method can be used to correct gender bias present in the output of TLMs and achieves gender parity for all evaluated contexts. We compare our method with FUDGE and PPLM-BoW, and show that our approach is able to achieve gender parity at a lower perplexity and better Self-BLEU score. The proposed method is accessible to a wide audience thanks to its simplicity and minimal compute needs. The findings in this paper are a step forward in understanding the generative mechanisms of TLMs.", "bibtex": "@InProceedings{pmlr-v162-cuadros22a,\n title = \t {Self-conditioning Pre-Trained Language Models},\n author = {Cuadros, Xavier Suau and Zappella, Luca and Apostoloff, Nicholas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4455--4473},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cuadros22a/cuadros22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cuadros22a.html},\n abstract = \t {In this paper we aim to investigate the mechanisms that guide text generation with pre-trained Transformer-based Language Models (TLMs). Grounded on the Product of Experts formulation by Hinton (1999), we describe a generative mechanism that exploits expert units which naturally exist in TLMs. Such units are responsible for detecting concepts in the input and conditioning text generation on such concepts. We describe how to identify expert units and how to activate them during inference in order to induce any desired concept in the generated output. We find that the activation of a surprisingly small amount of units is sufficient to steer text generation (as little as 3 units in a model with 345M parameters). While the objective of this work is to learn more about how TLMs work, we show that our method is effective for conditioning without fine-tuning or using extra parameters, even on fine-grained homograph concepts. Additionally, we show that our method can be used to correct gender bias present in the output of TLMs and achieves gender parity for all evaluated contexts. We compare our method with FUDGE and PPLM-BoW, and show that our approach is able to achieve gender parity at a lower perplexity and better Self-BLEU score. The proposed method is accessible to a wide audience thanks to its simplicity and minimal compute needs. 
The findings in this paper are a step forward in understanding the generative mechanisms of TLMs.}\n}", "pdf": "https://proceedings.mlr.press/v162/cuadros22a/cuadros22a.pdf", "supp": "", "pdf_size": 3887948, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3098558170082154964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff": "Apple; Apple; Apple", "aff_domain": "apple.com; ; ", "email": "apple.com; ; ", "github": "https://github.com/apple/ml-selfcond", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cuadros22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Self-supervised Models are Good Teaching Assistants for Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16185", "id": "16185", "proceeding": "https://proceedings.mlr.press/v162/wu22c.html", "poster": "/media/PosterPDFs/ICML%202022/3b3dbaf68507998acd6a5a5254ab2d76.png?t=1656756862.9706676", "slides": "", "author_site": "Haiyan Wu, Yuting Gao, Yinqi Zhang, Shaohui Lin, Yuan Xie, Xing Sun, Ke Li", "author": "Haiyan Wu; Yuting Gao; Yinqi Zhang; Shaohui Lin; Yuan Xie; Xing Sun; Ke Li", "abstract": "Transformers have shown remarkable progress on computer vision tasks in the past year. Compared to their CNN counterparts, transformers usually need the help of distillation to achieve comparable results on middle or small sized datasets. Meanwhile, recent researches discover that when transformers are trained with supervised and self-supervised manner respectively, the captured patterns are quite different both qualitatively and quantitatively. These findings motivate us to introduce an self-supervised teaching assistant (SSTA) besides the commonly used supervised teacher to improve the performance of transformers. Specifically, we propose a head-level knowledge distillation method that selects the most important head of the supervised teacher and self-supervised teaching assistant, and let the student mimic the attention distribution of these two heads, so as to make the student focus on the relationship between tokens deemed by the teacher and the teacher assistant. Extensive experiments verify the effectiveness of SSTA and demonstrate that the proposed SSTA is a good compensation to the supervised teacher. Meanwhile, some analytical experiments towards multiple perspectives (e.g. 
prediction, shape bias, robustness, and transferability to downstream tasks) with supervised teachers, self-supervised teaching assistants and students are inductive and may inspire future researches.", "bibtex": "@InProceedings{pmlr-v162-wu22c,\n title = \t {Self-supervised Models are Good Teaching Assistants for Vision Transformers},\n author = {Wu, Haiyan and Gao, Yuting and Zhang, Yinqi and Lin, Shaohui and Xie, Yuan and Sun, Xing and Li, Ke},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24031--24042},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22c/wu22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22c.html},\n abstract = \t {Transformers have shown remarkable progress on computer vision tasks in the past year. Compared to their CNN counterparts, transformers usually need the help of distillation to achieve comparable results on middle or small sized datasets. Meanwhile, recent researches discover that when transformers are trained with supervised and self-supervised manner respectively, the captured patterns are quite different both qualitatively and quantitatively. These findings motivate us to introduce an self-supervised teaching assistant (SSTA) besides the commonly used supervised teacher to improve the performance of transformers. Specifically, we propose a head-level knowledge distillation method that selects the most important head of the supervised teacher and self-supervised teaching assistant, and let the student mimic the attention distribution of these two heads, so as to make the student focus on the relationship between tokens deemed by the teacher and the teacher assistant. Extensive experiments verify the effectiveness of SSTA and demonstrate that the proposed SSTA is a good compensation to the supervised teacher. Meanwhile, some analytical experiments towards multiple perspectives (e.g. 
prediction, shape bias, robustness, and transferability to downstream tasks) with supervised teachers, self-supervised teaching assistants and students are inductive and may inspire future researches.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22c/wu22c.pdf", "supp": "", "pdf_size": 1617312, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10892407507586795078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Computer Science and Technology, East China Normal University, Shanghai, China+Tencent Youtu Lab, Shanghai, China; Tencent Youtu Lab, Shanghai, China; School of Computer Science and Technology, East China Normal University, Shanghai, China; School of Computer Science and Technology, East China Normal University, Shanghai, China; School of Computer Science and Technology, East China Normal University, Shanghai, China; Tencent Youtu Lab, Shanghai, China; Tencent Youtu Lab, Shanghai, China", "aff_domain": "cs.ecnu.edu.cn;cs.ecnu.edu.cn; ; ; ; ; ", "email": "cs.ecnu.edu.cn;cs.ecnu.edu.cn; ; ; ; ; ", "github": "https://github.com/GlassyWu/SSTA", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/wu22c.html", "aff_unique_index": "0+1;1;0;0;0;1;1", "aff_unique_norm": "East China Normal University;Tencent", "aff_unique_dep": "School of Computer Science and Technology;Youtu Lab", "aff_unique_url": "http://www.ecnu.edu.cn;https://www.tencent.com", "aff_unique_abbr": "ECNU;Tencent", "aff_campus_unique_index": "0+0;0;0;0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0+0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Self-supervised learning with random-projection quantizer for speech recognition", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17405", "id": "17405", "proceeding": "https://proceedings.mlr.press/v162/chiu22a.html", "poster": "/media/PosterPDFs/ICML%202022/65b0df23fd2d449ae1e4b2d27151d73b.png?t=1657317758.221762", "slides": "", "author_site": "Chung-Cheng Chiu, James Qin, Yu Zhang, Jiahui Yu, Yonghui Wu", "author": "Chung-Cheng Chiu; James Qin; Yu Zhang; Jiahui Yu; Yonghui Wu", "abstract": "We present a simple and effective self-supervised learning approach for speech recognition. The approach learns a model to predict the masked speech signals, in the form of discrete labels generated with a random-projection quantizer. In particular the quantizer projects speech inputs with a randomly initialized matrix, and does a nearest-neighbor lookup in a randomly-initialized codebook. Neither the matrix nor the codebook are updated during self-supervised learning. Since the random-projection quantizer is not trained and is separated from the speech recognition model, the design makes the approach flexible and is compatible with universal speech recognition architecture. On LibriSpeech our approach achieves similar word-error-rates as previous work using self-supervised learning with non-streaming models, and provides lower word-error-rates than previous work with streaming models. 
On multilingual tasks the approach also provides significant improvement over wav2vec 2.0 and w2v-BERT.", "bibtex": "@InProceedings{pmlr-v162-chiu22a,\n title = \t {Self-supervised learning with random-projection quantizer for speech recognition},\n author = {Chiu, Chung-Cheng and Qin, James and Zhang, Yu and Yu, Jiahui and Wu, Yonghui},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3915--3924},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chiu22a/chiu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chiu22a.html},\n abstract = \t {We present a simple and effective self-supervised learning approach for speech recognition. The approach learns a model to predict the masked speech signals, in the form of discrete labels generated with a random-projection quantizer. In particular the quantizer projects speech inputs with a randomly initialized matrix, and does a nearest-neighbor lookup in a randomly-initialized codebook. Neither the matrix nor the codebook are updated during self-supervised learning. Since the random-projection quantizer is not trained and is separated from the speech recognition model, the design makes the approach flexible and is compatible with universal speech recognition architecture. On LibriSpeech our approach achieves similar word-error-rates as previous work using self-supervised learning with non-streaming models, and provides lower word-error-rates than previous work with streaming models. On multilingual tasks the approach also provides significant improvement over wav2vec 2.0 and w2v-BERT.}\n}", "pdf": "https://proceedings.mlr.press/v162/chiu22a/chiu22a.pdf", "supp": "", "pdf_size": 642691, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7415500943435474289&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com;google.com;google.com; ; ", "email": "google.com;google.com;google.com; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chiu22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Selling Data To a Machine Learner: Pricing via Costly Signaling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18029", "id": "18029", "proceeding": "https://proceedings.mlr.press/v162/chen22m.html", "poster": "/media/PosterPDFs/ICML%202022/4079016d940210b4ae9ae7d41c4a2065_Qc4FWTf.png?t=1657266679.5342", "slides": "", "author_site": "Junjie Chen, Minming Li, Haifeng Xu", "author": "Junjie Chen; Minming Li; Haifeng Xu", "abstract": "We consider a new problem of selling data to a machine learner who looks to purchase data to train his machine learning model. 
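
The random-projection quantizer described in the speech record above ("Self-supervised learning with random-projection quantizer for speech recognition") is simple enough to sketch directly: a frozen random matrix projects each frame, and a frozen random codebook supplies discrete targets via nearest-neighbor lookup. The dimensions below are arbitrary, and details such as input normalization are omitted.

```python
import numpy as np

rng = np.random.default_rng(0)
d_in, d_proj, codebook_size = 80, 16, 512

# Both tensors are randomly initialized and never updated during self-supervised training.
projection = rng.standard_normal((d_in, d_proj))
codebook = rng.standard_normal((codebook_size, d_proj))

def quantize(frames):
    """Map speech frames [T, d_in] to discrete labels: random projection,
    then nearest-neighbor lookup in the random codebook."""
    z = frames @ projection                                        # [T, d_proj]
    dists = ((z[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)  # [T, codebook_size]
    return dists.argmin(axis=1)                                    # targets for masked prediction

labels = quantize(rng.standard_normal((100, d_in)))
print(labels[:10])
```

Because the quantizer is fixed and separate from the recognizer, it places no constraint on the downstream architecture, which is the flexibility the abstract emphasizes.
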
A key challenge in this setup is that neither the seller nor the machine learner knows the true quality of data. When designing a revenue-maximizing mechanism, a data seller faces the tradeoff between the cost and precision of data quality estimation. To address this challenge, we study a natural class of mechanisms that price data via costly signaling. Motivated by the assumption of i.i.d. data points as in classic machine learning models, we first consider selling homogeneous data and derive an optimal selling mechanism. We then turn to the sale of heterogeneous data, motivated by the sale of multiple data sets, and show that 1) on the negative side, it is NP-hard to approximate the optimal mechanism within a constant ratio e/(e+1) + o(1); while 2) on the positive side, there is a 1/k-approximate algorithm, where k is the number of the machine learner\u2019s private types.", "bibtex": "@InProceedings{pmlr-v162-chen22m,\n title = \t {Selling Data To a Machine Learner: Pricing via Costly Signaling},\n author = {Chen, Junjie and Li, Minming and Xu, Haifeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3336--3359},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22m/chen22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22m.html},\n abstract = \t {We consider a new problem of selling data to a machine learner who looks to purchase data to train his machine learning model. A key challenge in this setup is that neither the seller nor the machine learner knows the true quality of data. When designing a revenue-maximizing mechanism, a data seller faces the tradeoff between the cost and precision of data quality estimation. To address this challenge, we study a natural class of mechanisms that price data via costly signaling. Motivated by the assumption of i.i.d. data points as in classic machine learning models, we first consider selling homogeneous data and derive an optimal selling mechanism. 
We then turn to the sale of heterogeneous data, motivated by the sale of multiple data sets, and show that 1) on the negative side, it is NP-hard to approximate the optimal mechanism within a constant ratio e/(e+1) + o(1); while 2) on the positive side, there is a 1/k-approximate algorithm, where k is the number of the machine learner\u2019s private types.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22m/chen22m.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22m-supp.zip", "pdf_size": 897108, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7638864064087278077&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, City University of Hong Kong, Hong Kong, China+\u2217\u03b1-\u03b2order1; Department of Computer Science, City University of Hong Kong, Hong Kong, China+\u2217\u03b1-\u03b2order1; Department of Computer Science, University of Chicago, Chicago, Illinois, USA+\u2217\u03b1-\u03b2order2", "aff_domain": "my.cityu.edu.hk;cityu.edu.hk;uchicago.edu", "email": "my.cityu.edu.hk;cityu.edu.hk;uchicago.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22m.html", "aff_unique_index": "0;0;2", "aff_unique_norm": "City University of Hong Kong;;University of Chicago", "aff_unique_dep": "Department of Computer Science;;Department of Computer Science", "aff_unique_url": "https://www.cityu.edu.hk;;https://www.uchicago.edu", "aff_unique_abbr": "CityU;;UChicago", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Hong Kong;;Chicago", "aff_country_unique_index": "0;0;2", "aff_country_unique": "China;;United States" }, { "title": "Sequential Covariate Shift Detection Using Classifier Two-Sample Tests", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16271", "id": "16271", "proceeding": "https://proceedings.mlr.press/v162/jang22a.html", "poster": "/media/PosterPDFs/ICML%202022/7e3b7a5bafcb0fa8e8dfe3ea6aca9186.png?t=1657827828.622849", "slides": "", "author_site": "Sooyong Jang, Sangdon Park, Insup Lee, Osbert Bastani", "author": "Sooyong Jang; Sangdon Park; Insup Lee; Osbert Bastani", "abstract": "A standard assumption in supervised learning is that the training data and test data are from the same distribution. However, this assumption often fails to hold in practice, which can cause the learned model to perform poorly. We consider the problem of detecting covariate shift, where the covariate distribution shifts but the conditional distribution of labels given covariates remains the same. This problem can naturally be solved using a two-sample test{\u2014}i.e., test whether the current test distribution of covariates equals the training distribution of covariates. Our algorithm builds on classifier tests, which train a discriminator to distinguish train and test covariates, and then use the accuracy of this discriminator as a test statistic. A key challenge is that classifier tests assume given a fixed set of test covariates. In practice, test covariates often arrive sequentially over time{\u2014}e.g., a self-driving car observes a stream of images while driving. Furthermore, covariate shift can occur multiple times{\u2014}i.e., shift and then shift back later or gradually shift over time. To address these challenges, our algorithm trains the discriminator online. 
Additionally, it evaluates test accuracy using each new covariate before taking a gradient step; this strategy avoids constructing a held-out test set, which can improve sample efficiency. We prove that this optimization preserves the correctness{\u2014}i.e., our algorithm achieves a desired bound on the false positive rate. In our experiments, we show that our algorithm efficiently detects covariate shifts on multiple datasets{\u2014}ImageNet, IWildCam, and Py150.", "bibtex": "@InProceedings{pmlr-v162-jang22a,\n title = \t {Sequential Covariate Shift Detection Using Classifier Two-Sample Tests},\n author = {Jang, Sooyong and Park, Sangdon and Lee, Insup and Bastani, Osbert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9845--9880},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jang22a/jang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jang22a.html},\n abstract = \t {A standard assumption in supervised learning is that the training data and test data are from the same distribution. However, this assumption often fails to hold in practice, which can cause the learned model to perform poorly. We consider the problem of detecting covariate shift, where the covariate distribution shifts but the conditional distribution of labels given covariates remains the same. This problem can naturally be solved using a two-sample test{\u2014}i.e., test whether the current test distribution of covariates equals the training distribution of covariates. Our algorithm builds on classifier tests, which train a discriminator to distinguish train and test covariates, and then use the accuracy of this discriminator as a test statistic. A key challenge is that classifier tests assume given a fixed set of test covariates. In practice, test covariates often arrive sequentially over time{\u2014}e.g., a self-driving car observes a stream of images while driving. Furthermore, covariate shift can occur multiple times{\u2014}i.e., shift and then shift back later or gradually shift over time. To address these challenges, our algorithm trains the discriminator online. Additionally, it evaluates test accuracy using each new covariate before taking a gradient step; this strategy avoids constructing a held-out test set, which can improve sample efficiency. We prove that this optimization preserves the correctness{\u2014}i.e., our algorithm achieves a desired bound on the false positive rate. 
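
The sequential procedure described in this record ("Sequential Covariate Shift Detection Using Classifier Two-Sample Tests") lends itself to a short sketch: a discriminator is trained online to tell training covariates from incoming test covariates, and each new covariate is scored before the gradient step, so no held-out set is needed. The logistic-regression discriminator, learning rate, and fixed accuracy threshold below are illustrative stand-ins; the paper's actual test calibrates its detection rule to guarantee a false-positive-rate bound.

```python
import numpy as np

class OnlineClassifierTest:
    """Prequential classifier two-sample test sketch: evaluate on each new covariate
    first, then take one gradient step on the train-vs-test discriminator."""
    def __init__(self, dim, lr=0.1, threshold=0.65):
        self.w, self.b = np.zeros(dim), 0.0
        self.lr, self.threshold = lr, threshold
        self.correct, self.seen = 0, 0

    def _p_test(self, x):
        return 1.0 / (1.0 + np.exp(-(self.w @ x + self.b)))

    def update(self, x_test, x_train):
        # prequential evaluation: score before training on these covariates
        self.correct += int(self._p_test(x_test) > 0.5) + int(self._p_test(x_train) <= 0.5)
        self.seen += 2
        for x, y in ((x_test, 1.0), (x_train, 0.0)):   # label 1 = test, 0 = train
            grad = self._p_test(x) - y
            self.w -= self.lr * grad * x
            self.b -= self.lr * grad
        return self.correct / self.seen > self.threshold   # crude stand-in for the calibrated test

rng = np.random.default_rng(0)
detector = OnlineClassifierTest(dim=5)
for _ in range(500):   # test covariates drawn from a mean-shifted distribution
    flagged = detector.update(rng.normal(1.0, 1.0, 5), rng.normal(0.0, 1.0, 5))
print("shift flagged:", flagged)
```
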
In our experiments, we show that our algorithm efficiently detects covariate shifts on multiple datasets{\u2014}ImageNet, IWildCam, and Py150.}\n}", "pdf": "https://proceedings.mlr.press/v162/jang22a/jang22a.pdf", "supp": "", "pdf_size": 6815443, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17014091212404389073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "PRECISE Center, University of Pennsylvania, USA; PRECISE Center, University of Pennsylvania, USA + School of Cybersecurity and Privacy, Georgia Institute of Technology, USA; PRECISE Center, University of Pennsylvania, USA; PRECISE Center, University of Pennsylvania, USA", "aff_domain": "seas.upenn.edu; ; ; ", "email": "seas.upenn.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jang22a.html", "aff_unique_index": "0;0+1;0;0", "aff_unique_norm": "University of Pennsylvania;Georgia Institute of Technology", "aff_unique_dep": "PRECISE Center;School of Cybersecurity and Privacy", "aff_unique_url": "https://www.upenn.edu;https://www.gatech.edu", "aff_unique_abbr": "UPenn;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Sequential and Parallel Constrained Max-value Entropy Search via Information Lower Bound", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17935", "id": "17935", "proceeding": "https://proceedings.mlr.press/v162/takeno22a.html", "poster": "/media/PosterPDFs/ICML%202022/e7d4c8d4fe04d9b4539a075d809c6d01.png?t=1657785003.9705334", "slides": "", "author_site": "Shion Takeno, Tomoyuki Tamura, Kazuki Shitara, Masayuki Karasuyama", "author": "Shion Takeno; Tomoyuki Tamura; Kazuki Shitara; Masayuki Karasuyama", "abstract": "Max-value entropy search (MES) is one of the state-of-the-art approaches in Bayesian optimization (BO). In this paper, we propose a novel variant of MES for constrained problems, called Constrained MES via Information lower BOund (CMES-IBO), that is based on a Monte Carlo (MC) estimator of a lower bound of a mutual information (MI). Unlike existing studies, our MI is defined so that uncertainty with respect to feasibility can be incorporated. We derive a lower bound of the MI that guarantees non-negativity, while a constrained counterpart of conventional MES can be negative. We further provide theoretical analysis that assures the low-variability of our estimator which has never been investigated for any existing information-theoretic BO. Moreover, using the conditional MI, we extend CMES-IBO to the parallel setting while maintaining the desirable properties. 
We demonstrate the effectiveness of CMES-IBO by several benchmark functions and real-world problems.", "bibtex": "@InProceedings{pmlr-v162-takeno22a,\n title = \t {Sequential and Parallel Constrained Max-value Entropy Search via Information Lower Bound},\n author = {Takeno, Shion and Tamura, Tomoyuki and Shitara, Kazuki and Karasuyama, Masayuki},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20960--20986},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/takeno22a/takeno22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/takeno22a.html},\n abstract = \t {Max-value entropy search (MES) is one of the state-of-the-art approaches in Bayesian optimization (BO). In this paper, we propose a novel variant of MES for constrained problems, called Constrained MES via Information lower BOund (CMES-IBO), that is based on a Monte Carlo (MC) estimator of a lower bound of a mutual information (MI). Unlike existing studies, our MI is defined so that uncertainty with respect to feasibility can be incorporated. We derive a lower bound of the MI that guarantees non-negativity, while a constrained counterpart of conventional MES can be negative. We further provide theoretical analysis that assures the low-variability of our estimator which has never been investigated for any existing information-theoretic BO. Moreover, using the conditional MI, we extend CMES-IBO to the parallel setting while maintaining the desirable properties. We demonstrate the effectiveness of CMES-IBO by several benchmark functions and real-world problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/takeno22a/takeno22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/takeno22a-supp.zip", "pdf_size": 2536349, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16036837653776960409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan + Center for Advanced Intelligence Project, RIKEN, Tokyo, Japan; Department of Physical Science and Engineering, Nagoya Institute of Technology, Aichi, Japan; Joining and Welding Research Institute, Osaka University, Osaka, Japan + Nanostructures Research Laboratory, Japan Fine Ceramics Center, Aichi, Japan; Department of Computer Science, Nagoya Institute of Technology, Aichi, Japan", "aff_domain": "nitech.ac.jp;nitech.ac.jp;nitech.ac.jp;nitech.ac.jp", "email": "nitech.ac.jp;nitech.ac.jp;nitech.ac.jp;nitech.ac.jp", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/takeno22a.html", "aff_unique_index": "0+1;0;2+3;0", "aff_unique_norm": "Nagoya Institute of Technology;RIKEN;Osaka University;Japan Fine Ceramics Center", "aff_unique_dep": "Department of Computer Science;Center for Advanced Intelligence Project;Joining and Welding Research Institute;Nanostructures Research Laboratory", "aff_unique_url": "https://www.nitech.ac.jp;https://www.riken.jp;https://www.osaka-u.ac.jp;https://www.jfcc.or.jp", "aff_unique_abbr": "NIT;RIKEN;OU;JFCC", "aff_campus_unique_index": "0+1;3+4;0", "aff_campus_unique": "Nagoya;Tokyo;;Osaka;Aichi", "aff_country_unique_index": "0+0;0;0+0;0", "aff_country_unique": "Japan" }, { 
"title": "Set Based Stochastic Subsampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17893", "id": "17893", "proceeding": "https://proceedings.mlr.press/v162/andreis22a.html", "poster": "/media/PosterPDFs/ICML%202022/452bf208bf901322968557227b8f6efe.png?t=1657770472.7847097", "slides": "", "author_site": "Bruno Andreis, Seanie Lee, A. Tuan Nguyen, Juho Lee, Eunho Yang, Sung Ju Hwang", "author": "Bruno Andreis; Seanie Lee; A. Tuan Nguyen; Juho Lee; Eunho Yang; Sung Ju Hwang", "abstract": "Deep models are designed to operate on huge volumes of high dimensional data such as images. In order to reduce the volume of data these models must process, we propose a set-based two-stage end-to-end neural subsampling model that is jointly optimized with an", "bibtex": "@InProceedings{pmlr-v162-andreis22a,\n title = \t {Set Based Stochastic Subsampling},\n author = {Andreis, Bruno and Lee, Seanie and Nguyen, A. Tuan and Lee, Juho and Yang, Eunho and Hwang, Sung Ju},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {619--638},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/andreis22a/andreis22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/andreis22a.html},\n abstract = \t {Deep models are designed to operate on huge volumes of high dimensional data such as images. In order to reduce the volume of data these models must process, we propose a set-based two-stage end-to-end neural subsampling model that is jointly optimized with an", "pdf": "https://proceedings.mlr.press/v162/andreis22a/andreis22a.pdf", "supp": "", "pdf_size": 9803021, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yYiVTnEYPN8J:scholar.google.com/&scioq=Set+Based+Stochastic+Subsampling&hl=en&as_sdt=0,33", "gs_version_total": 9, "aff": "Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea; University of Oxford, Oxford, United Kingdom; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, Seoul, South Korea; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, Seoul, South Korea; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Seoul, South Korea+AITRICS, Seoul, South Korea", "aff_domain": "kaist.ac.kr; ; ; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ; ; ;kaist.ac.kr", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/andreis22a.html", "aff_unique_index": "0;0;1;0+2;0+2;0+2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Oxford;AITRICS", "aff_unique_dep": "Graduate School of AI;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.ox.ac.uk;", "aff_unique_abbr": "KAIST;Oxford;", "aff_campus_unique_index": "0;0;1;0+0;0+0;0+0", "aff_campus_unique": "Seoul;Oxford", "aff_country_unique_index": "0;0;1;0+0;0+0;0+0", "aff_country_unique": "South Korea;United Kingdom" }, { "title": "Set Norm and Equivariant Skip Connections: Putting the Deep in Deep Sets", "status": 
"Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16893", "id": "16893", "proceeding": "https://proceedings.mlr.press/v162/zhang22ac.html", "poster": "/media/PosterPDFs/ICML%202022/bd0cc810b580b35884bd9df37c0e8b0f.png?t=1657674864.5807889", "slides": "", "author_site": "Lily Zhang, Veronica Tozzo, John Higgins, Rajesh Ranganath", "author": "Lily Zhang; Veronica Tozzo; John Higgins; Rajesh Ranganath", "abstract": "Permutation invariant neural networks are a promising tool for predictive modeling of set data. We show, however, that existing architectures struggle to perform well when they are deep. In this work, we mathematically and empirically analyze normalization layers and residual connections in the context of deep permutation invariant neural networks. We develop set norm, a normalization tailored for sets, and introduce the \u201cclean path principle\u201d for equivariant residual connections alongside a novel benefit of such connections, the reduction of information loss. Based on our analysis, we propose Deep Sets++ and Set Transformer++, deep models that reach comparable or better performance than their original counterparts on a diverse suite of tasks. We additionally introduce Flow-RBC, a new single-cell dataset and real-world application of permutation invariant prediction. We open-source our data and code here: https://github.com/rajesh-lab/deep_permutation_invariant.", "bibtex": "@InProceedings{pmlr-v162-zhang22ac,\n title = \t {Set Norm and Equivariant Skip Connections: Putting the Deep in Deep Sets},\n author = {Zhang, Lily and Tozzo, Veronica and Higgins, John and Ranganath, Rajesh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26559--26574},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ac/zhang22ac.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ac.html},\n abstract = \t {Permutation invariant neural networks are a promising tool for predictive modeling of set data. We show, however, that existing architectures struggle to perform well when they are deep. In this work, we mathematically and empirically analyze normalization layers and residual connections in the context of deep permutation invariant neural networks. We develop set norm, a normalization tailored for sets, and introduce the \u201cclean path principle\u201d for equivariant residual connections alongside a novel benefit of such connections, the reduction of information loss. Based on our analysis, we propose Deep Sets++ and Set Transformer++, deep models that reach comparable or better performance than their original counterparts on a diverse suite of tasks. We additionally introduce Flow-RBC, a new single-cell dataset and real-world application of permutation invariant prediction. 
We open-source our data and code here: https://github.com/rajesh-lab/deep_permutation_invariant.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ac/zhang22ac.pdf", "supp": "", "pdf_size": 757726, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8359318767015654610&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Center for Data Science, New York University, New York, NY+Department of Computer Science, New York University, New York, NY; Center for Data Science, New York University, New York, NY+Massachusetts General Hospital, Harvard Medical School, Cambridge, MA+Department of Systems Biology, Harvard Medical School, Boston, MA; Massachusetts General Hospital, Harvard Medical School, Cambridge, MA+Department of Systems Biology, Harvard Medical School, Boston, MA; Center for Data Science, New York University, New York, NY+Department of Computer Science, New York University, New York, NY", "aff_domain": "nyu.edu;mgh.harvard.edu; ;cs.nyu.edu", "email": "nyu.edu;mgh.harvard.edu; ;cs.nyu.edu", "github": "https://github.com/rajesh-lab/deep-permutation-invariant", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22ac.html", "aff_unique_index": "0+0;0+1+2;1+2;0+0", "aff_unique_norm": "New York University;Massachusetts General Hospital;Harvard Medical School", "aff_unique_dep": "Center for Data Science;Harvard Medical School;Department of Systems Biology", "aff_unique_url": "https://www.nyu.edu;https://www.massgeneral.org;https://hms.harvard.edu", "aff_unique_abbr": "NYU;MGH;HMS", "aff_campus_unique_index": "0+0;0+1+2;1+2;0+0", "aff_campus_unique": "New York;Cambridge;Boston", "aff_country_unique_index": "0+0;0+0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Sharp-MAML: Sharpness-Aware Model-Agnostic Meta Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16661", "id": "16661", "proceeding": "https://proceedings.mlr.press/v162/abbas22b.html", "poster": "/media/PosterPDFs/ICML%202022/3fc0a5dc1f5757c71b88be8adbfd10e9_SQpxOei.png?t=1657810895.1891732", "slides": "", "author_site": "Momin Abbas, Quan Xiao, Lisha Chen, Pin-Yu Chen, Tianyi Chen", "author": "Momin Abbas; Quan Xiao; Lisha Chen; Pin-Yu Chen; Tianyi Chen", "abstract": "Model-agnostic meta learning (MAML) is currently one of the dominating approaches for few-shot meta-learning. Albeit its effectiveness, the optimization of MAML can be challenging due to the innate bilevel problem structure. Specifically, the loss landscape of MAML is much more complex with possibly more saddle points and local minimizers than its empirical risk minimization counterpart. To address this challenge, we leverage the recently invented sharpness-aware minimization and develop a sharpness-aware MAML approach that we term Sharp-MAML. We empirically demonstrate that Sharp-MAML and its computation-efficient variant can outperform the plain-vanilla MAML baseline (e.g., +3% accuracy on Mini-Imagenet). We complement the empirical study with the convergence rate analysis and the generalization bound of Sharp-MAML. 
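
The Sharp-MAML record above builds on sharpness-aware minimization (SAM). Below is a generic sketch of one sharpness-aware update: ascend to the locally worst-case weight perturbation within a radius rho, then descend using the gradient taken there. How Sharp-MAML places this step inside the bilevel MAML loop (inner adaptation, outer meta-update, or both) is not spelled out here, so treat this only as the underlying primitive.

```python
import torch

def sharpness_aware_step(params, loss_fn, lr=0.01, rho=0.05):
    """One SAM-style update: perturb weights toward the worst-case direction
    within an L2 ball of radius rho, then descend from the perturbed point."""
    grads = torch.autograd.grad(loss_fn(params), params)
    scale = rho / (torch.sqrt(sum((g ** 2).sum() for g in grads)) + 1e-12)
    perturbed = [p + scale * g for p, g in zip(params, grads)]        # ascent step
    sharp_grads = torch.autograd.grad(loss_fn(perturbed), perturbed)  # gradient at the worst case
    return [(p - lr * g).detach().requires_grad_() for p, g in zip(params, sharp_grads)]

# toy quadratic in place of a task loss
w = [torch.randn(3, requires_grad=True)]
loss_fn = lambda ps: ((ps[0] - 1.0) ** 2).sum()
for _ in range(200):
    w = sharpness_aware_step(w, loss_fn)
print(w[0])   # ends close to the minimizer at [1., 1., 1.]
```
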
To the best of our knowledge, this is the first empirical and theoretical study on sharpness-aware minimization in the context of bilevel learning.", "bibtex": "@InProceedings{pmlr-v162-abbas22b,\n title = \t {Sharp-{MAML}: Sharpness-Aware Model-Agnostic Meta Learning},\n author = {Abbas, Momin and Xiao, Quan and Chen, Lisha and Chen, Pin-Yu and Chen, Tianyi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10--32},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/abbas22b/abbas22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/abbas22b.html},\n abstract = \t {Model-agnostic meta learning (MAML) is currently one of the dominating approaches for few-shot meta-learning. Albeit its effectiveness, the optimization of MAML can be challenging due to the innate bilevel problem structure. Specifically, the loss landscape of MAML is much more complex with possibly more saddle points and local minimizers than its empirical risk minimization counterpart. To address this challenge, we leverage the recently invented sharpness-aware minimization and develop a sharpness-aware MAML approach that we term Sharp-MAML. We empirically demonstrate that Sharp-MAML and its computation-efficient variant can outperform the plain-vanilla MAML baseline (e.g., +3% accuracy on Mini-Imagenet). We complement the empirical study with the convergence rate analysis and the generalization bound of Sharp-MAML. To the best of our knowledge, this is the first empirical and theoretical study on sharpness-aware minimization in the context of bilevel learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/abbas22b/abbas22b.pdf", "supp": "", "pdf_size": 1855454, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14950420836477699137&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Rensselaer Polytechnic Institute, Troy, NY; Rensselaer Polytechnic Institute, Troy, NY; Rensselaer Polytechnic Institute, Troy, NY; IBM Thomas J. Watson Research Center, NY, USA; Rensselaer Polytechnic Institute, Troy, NY", "aff_domain": "rpi.edu;rpi.edu;rpi.edu;us.ibm.com;gmail.com", "email": "rpi.edu;rpi.edu;rpi.edu;us.ibm.com;gmail.com", "github": "https://github.com/mominabbass/Sharp-MAML", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/abbas22b.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Rensselaer Polytechnic Institute;IBM", "aff_unique_dep": ";IBM Thomas J. 
Watson Research Center", "aff_unique_url": "https://www.rpi.edu;https://www.ibm.com/research/watson", "aff_unique_abbr": "RPI;IBM Watson", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Troy;Yorktown Heights", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sharpened Quasi-Newton Methods: Faster Superlinear Rate and Larger Local Convergence Neighborhood", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16723", "id": "16723", "proceeding": "https://proceedings.mlr.press/v162/jin22b.html", "poster": "/media/PosterPDFs/ICML%202022/9ab8a8a9349eb1dd73ce155ce64c80fa.png?t=1657899601.3805852", "slides": "", "author_site": "Qiujiang Jin, Alec Koppel, Ketan Rajawat, Aryan Mokhtari", "author": "Qiujiang Jin; Alec Koppel; Ketan Rajawat; Aryan Mokhtari", "abstract": "Non-asymptotic analysis of quasi-Newton methods have received a lot of attention recently. In particular, several works have established a non-asymptotic superlinear rate of $$\\mathcal{O}((1/\\sqrt{t})^t)$$ for the (classic) BFGS method by exploiting the fact that its error of Newton direction approximation approaches zero. Moreover, a greedy variant of the BFGS method was recently proposed which accelerates the convergence of BFGS by directly approximating the Hessian matrix, instead of Newton direction, and achieves a fast local quadratic convergence rate. Alas, the local quadratic convergence of Greedy-BFGS requires way more updates compared to the number of iterations that BFGS requires for a local superlinear rate. This is due to the fact that in Greedy-BFGS the Hessian is directly approximated and the Newton direction approximation may not be as accurate as the one for BFGS. In this paper, we close this gap and present a novel BFGS method that has the best of two worlds. More precisely, it leverages the approximation ideas of both BFGS and Greedy-BFGS to properly approximate both the Newton direction and the Hessian matrix. Our theoretical results show that our method out-performs both BFGS and Greedy-BFGS in terms of convergence rate, while it reaches its quadratic convergence rate with fewer steps compared to Greedy-BFGS. Numerical experiments on various datasets also confirm our theoretical findings.", "bibtex": "@InProceedings{pmlr-v162-jin22b,\n title = \t {Sharpened Quasi-{N}ewton Methods: Faster Superlinear Rate and Larger Local Convergence Neighborhood},\n author = {Jin, Qiujiang and Koppel, Alec and Rajawat, Ketan and Mokhtari, Aryan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10228--10250},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22b/jin22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22b.html},\n abstract = \t {Non-asymptotic analysis of quasi-Newton methods have received a lot of attention recently. In particular, several works have established a non-asymptotic superlinear rate of $$\\mathcal{O}((1/\\sqrt{t})^t)$$ for the (classic) BFGS method by exploiting the fact that its error of Newton direction approximation approaches zero. 
Moreover, a greedy variant of the BFGS method was recently proposed which accelerates the convergence of BFGS by directly approximating the Hessian matrix, instead of Newton direction, and achieves a fast local quadratic convergence rate. Alas, the local quadratic convergence of Greedy-BFGS requires way more updates compared to the number of iterations that BFGS requires for a local superlinear rate. This is due to the fact that in Greedy-BFGS the Hessian is directly approximated and the Newton direction approximation may not be as accurate as the one for BFGS. In this paper, we close this gap and present a novel BFGS method that has the best of two worlds. More precisely, it leverages the approximation ideas of both BFGS and Greedy-BFGS to properly approximate both the Newton direction and the Hessian matrix. Our theoretical results show that our method out-performs both BFGS and Greedy-BFGS in terms of convergence rate, while it reaches its quadratic convergence rate with fewer steps compared to Greedy-BFGS. Numerical experiments on various datasets also confirm our theoretical findings.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22b/jin22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jin22b-supp.zip", "pdf_size": 949215, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14977866324032999705&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, The University of Texas at Austin, Austin, TX, USA; Amazon, Bellevue, WA, USA; Department of Electrical Engineering, Indian Institute of Technology Kanpur, Kanpur, UP, INDIA; Department of Electrical and Computer Engineering, The University of Texas at Austin, Austin, TX, USA", "aff_domain": "austin.utexas.edu; ; ; ", "email": "austin.utexas.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jin22b.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Texas at Austin;Amazon;Indian Institute of Technology Kanpur", "aff_unique_dep": "Department of Electrical and Computer Engineering;Amazon.com, Inc.;Department of Electrical Engineering", "aff_unique_url": "https://www.utexas.edu;https://www.amazon.com;https://www.iitk.ac.in", "aff_unique_abbr": "UT Austin;Amazon;IIT Kanpur", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "Austin;Bellevue;Kanpur", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;India" }, { "title": "ShiftAddNAS: Hardware-Inspired Search for More Accurate and Efficient Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16349", "id": "16349", "proceeding": "https://proceedings.mlr.press/v162/you22a.html", "poster": "", "slides": "", "author_site": "Haoran You, Baopu Li, Shi Huihong, Yonggan Fu, Yingyan Lin", "author": "Haoran You; Baopu Li; Shi Huihong; Yonggan Fu; Yingyan Lin", "abstract": "Neural networks (NNs) with intensive multiplications (e.g., convolutions and transformers) are powerful yet power hungry, impeding their more extensive deployment into resource-constrained edge devices. As such, multiplication-free networks, which follow a common practice in energy-efficient hardware implementation to parameterize NNs with more efficient operators (e.g., bitwise shifts and additions), have gained growing attention. However, multiplication-free networks in general under-perform their vanilla counterparts in terms of the achieved accuracy. 
To this end, this work advocates hybrid NNs that consist of both powerful yet costly multiplications and efficient yet less powerful operators for marrying the best of both worlds, and proposes ShiftAddNAS, which can automatically search for more accurate and more efficient NNs. Our ShiftAddNAS highlights two enablers. Specifically, it integrates (1) the first hybrid search space that incorporates both multiplication-based and multiplication-free operators for facilitating the development of both accurate and efficient hybrid NNs; and (2) a novel weight sharing strategy that enables effective weight sharing among different operators that follow heterogeneous distributions (e.g., Gaussian for convolutions vs. Laplacian for add operators) and simultaneously leads to a largely reduced supernet size and much better searched networks. Extensive experiments and ablation studies on various models, datasets, and tasks consistently validate the effectiveness of ShiftAddNAS, e.g., achieving up to a +7.7% higher accuracy or a +4.9 better BLEU score as compared to state-of-the-art expert-designed and neural architecture searched NNs, while leading to up to 93% or 69% energy and latency savings, respectively. Codes and pretrained models are available at https://github.com/RICE-EIC/ShiftAddNAS.", "bibtex": "@InProceedings{pmlr-v162-you22a,\n title = \t {{S}hift{A}dd{NAS}: Hardware-Inspired Search for More Accurate and Efficient Neural Networks},\n author = {You, Haoran and Li, Baopu and Huihong, Shi and Fu, Yonggan and Lin, Yingyan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25566--25580},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/you22a/you22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/you22a.html},\n abstract = \t {Neural networks (NNs) with intensive multiplications (e.g., convolutions and transformers) are powerful yet power hungry, impeding their more extensive deployment into resource-constrained edge devices. As such, multiplication-free networks, which follow a common practice in energy-efficient hardware implementation to parameterize NNs with more efficient operators (e.g., bitwise shifts and additions), have gained growing attention. However, multiplication-free networks in general under-perform their vanilla counterparts in terms of the achieved accuracy. To this end, this work advocates hybrid NNs that consist of both powerful yet costly multiplications and efficient yet less powerful operators for marrying the best of both worlds, and proposes ShiftAddNAS, which can automatically search for more accurate and more efficient NNs. Our ShiftAddNAS highlights two enablers. Specifically, it integrates (1) the first hybrid search space that incorporates both multiplication-based and multiplication-free operators for facilitating the development of both accurate and efficient hybrid NNs; and (2) a novel weight sharing strategy that enables effective weight sharing among different operators that follow heterogeneous distributions (e.g., Gaussian for convolutions vs. Laplacian for add operators) and simultaneously leads to a largely reduced supernet size and much better searched networks. 
Extensive experiments and ablation studies on various models, datasets, and tasks consistently validate the effectiveness of ShiftAddNAS, e.g., achieving up to a +7.7% higher accuracy or a +4.9 better BLEU score as compared to state-of-the-art expert-designed and neural architecture searched NNs, while leading to up to 93% or 69% energy and latency savings, respectively. Codes and pretrained models are available at https://github.com/RICE-EIC/ShiftAddNAS.}\n}", "pdf": "https://proceedings.mlr.press/v162/you22a/you22a.pdf", "supp": "", "pdf_size": 1206313, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17026416337828414455&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Rice University; Oracle Health and AI; Nanjing University; Rice University; Rice University", "aff_domain": "oracle.com;rice.edu; ; ; ", "email": "oracle.com;rice.edu; ; ; ", "github": "https://github.com/RICE-EIC/ShiftAddNAS", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/you22a.html", "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Rice University;Oracle Corporation;Nanjing University", "aff_unique_dep": ";Health and AI;", "aff_unique_url": "https://www.rice.edu;https://www.oracle.com/health/;https://www.nju.edu.cn", "aff_unique_abbr": "Rice;Oracle;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;China" }, { "title": "Short-Term Plasticity Neurons Learning to Learn and Forget", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18309", "id": "18309", "proceeding": "https://proceedings.mlr.press/v162/rodriguez22b.html", "poster": "/media/PosterPDFs/ICML%202022/a2802cade04644083dcde1c8c483ed9a_ffbgBL0.png?t=1657797504.7412221", "slides": "/media/icml-2022/Slides/18309.pdf", "author_site": "Hector Garcia Rodriguez, Qinghai Guo, Timoleon Moraitis", "author": "Hector Garcia Rodriguez; Qinghai Guo; Timoleon Moraitis", "abstract": "Short-term plasticity (STP) is a mechanism that stores decaying memories in synapses of the cerebral cortex. In computing practice, STP has been used, but mostly in the niche of spiking neurons, even though theory predicts that it is the optimal solution to certain dynamic tasks. Here we present a new type of recurrent neural unit, the STP Neuron (STPN), which indeed turns out strikingly powerful. Its key mechanism is that synapses have a state, propagated through time by a self-recurrent connection-within-the-synapse. This formulation enables training the plasticity with backpropagation through time, resulting in a form of learning to learn and forget in the short term. The STPN outperforms all tested alternatives, i.e. RNNs, LSTMs, other models with fast weights, and differentiable plasticity. We confirm this in both supervised and reinforcement learning (RL), and in tasks such as Associative Retrieval, Maze Exploration, Atari video games, and MuJoCo robotics. Moreover, we calculate that, in neuromorphic or biological circuits, the STPN minimizes energy consumption across models, as it depresses individual synapses dynamically. Based on these, biological STP may have been a strong evolutionary attractor that maximizes both efficiency and computational power. The STPN now brings these neuromorphic advantages also to a broad spectrum of machine learning practice. 
Code is available in https://github.com/NeuromorphicComputing/stpn.", "bibtex": "@InProceedings{pmlr-v162-rodriguez22b,\n title = \t {Short-Term Plasticity Neurons Learning to Learn and Forget},\n author = {Rodriguez, Hector Garcia and Guo, Qinghai and Moraitis, Timoleon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18704--18722},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rodriguez22b/rodriguez22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/rodriguez22b.html},\n abstract = \t {Short-term plasticity (STP) is a mechanism that stores decaying memories in synapses of the cerebral cortex. In computing practice, STP has been used, but mostly in the niche of spiking neurons, even though theory predicts that it is the optimal solution to certain dynamic tasks. Here we present a new type of recurrent neural unit, the STP Neuron (STPN), which indeed turns out strikingly powerful. Its key mechanism is that synapses have a state, propagated through time by a self-recurrent connection-within-the-synapse. This formulation enables training the plasticity with backpropagation through time, resulting in a form of learning to learn and forget in the short term. The STPN outperforms all tested alternatives, i.e. RNNs, LSTMs, other models with fast weights, and differentiable plasticity. We confirm this in both supervised and reinforcement learning (RL), and in tasks such as Associative Retrieval, Maze Exploration, Atari video games, and MuJoCo robotics. Moreover, we calculate that, in neuromorphic or biological circuits, the STPN minimizes energy consumption across models, as it depresses individual synapses dynamically. Based on these, biological STP may have been a strong evolutionary attractor that maximizes both efficiency and computational power. The STPN now brings these neuromorphic advantages also to a broad spectrum of machine learning practice. 
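
For the STPN record above, the key mechanism is that each synapse carries its own decaying state, updated online and trained end-to-end with backpropagation through time. The cell below is a generic short-term-plasticity / fast-weights sketch in that spirit, not the paper's exact STPN equations (see the linked repository for those); the Hebbian outer-product update and tanh nonlinearity are illustrative choices.

```python
import torch
import torch.nn as nn

class PlasticRecurrentCell(nn.Module):
    """Sketch of a recurrent unit whose synapses carry a decaying state F,
    updated by a Hebbian outer product and added to the fixed weight W."""
    def __init__(self, n_in, n_hidden):
        super().__init__()
        self.W = nn.Parameter(0.1 * torch.randn(n_hidden, n_in))
        self.decay = nn.Parameter(torch.full((n_hidden, n_in), 0.9))   # learned per-synapse retention
        self.gain = nn.Parameter(torch.full((n_hidden, n_in), 0.01))   # learned plasticity rate

    def forward(self, x_seq):                                 # x_seq: [T, batch, n_in]
        batch = x_seq.shape[1]
        F = torch.zeros(batch, *self.W.shape)                 # per-sample synaptic state
        h = torch.zeros(batch, self.W.shape[0])
        for x in x_seq:                                       # x: [batch, n_in]
            h = torch.tanh(torch.einsum('bhi,bi->bh', self.W + F, x))
            F = self.decay * F + self.gain * torch.einsum('bh,bi->bhi', h, x)   # Hebbian trace
        return h

cell = PlasticRecurrentCell(n_in=5, n_hidden=8)
out = cell(torch.randn(20, 3, 5))    # sequence length 20, batch 3
out.sum().backward()                 # plasticity parameters train with BPTT
print(out.shape)
```
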
Code is available in https://github.com/NeuromorphicComputing/stpn.}\n}", "pdf": "https://proceedings.mlr.press/v162/rodriguez22b/rodriguez22b.pdf", "supp": "", "pdf_size": 4106904, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13353176637859953693&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Huawei Technologies \u2013 Zurich Research Center, Switzerland+University College London, United Kingdom; Advanced Computing & Storage Lab, Huawei Technologies, Shenzhen, China; Huawei Technologies \u2013 Zurich Research Center, Switzerland", "aff_domain": "huawei.com; ;huawei.com", "email": "huawei.com; ;huawei.com", "github": "https://github.com/NeuromorphicComputing/stpn", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/rodriguez22b.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "Huawei;University College London", "aff_unique_dep": "Zurich Research Center;", "aff_unique_url": "https://www.huawei.com;https://www.ucl.ac.uk", "aff_unique_abbr": "Huawei;UCL", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Zurich;;Shenzhen", "aff_country_unique_index": "0+1;2;0", "aff_country_unique": "Switzerland;United Kingdom;China" }, { "title": "Showing Your Offline Reinforcement Learning Work: Online Evaluation Budget Matters", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17509", "id": "17509", "proceeding": "https://proceedings.mlr.press/v162/kurenkov22a.html", "poster": "/media/PosterPDFs/ICML%202022/08aee6276db142f4b8ac98fb8ee0ed1b.png?t=1657552480.4347138", "slides": "", "author_site": "Vladislav Kurenkov, Sergey Kolesnikov", "author": "Vladislav Kurenkov; Sergey Kolesnikov", "abstract": "In this work, we argue for the importance of an online evaluation budget for a reliable comparison of deep offline RL algorithms. First, we delineate that the online evaluation budget is problem-dependent, where some problems allow for less but others for more. And second, we demonstrate that the preference between algorithms is budget-dependent across a diverse range of decision-making domains such as Robotics, Finance, and Energy Management. Following the points above, we suggest reporting the performance of deep offline RL algorithms under varying online evaluation budgets. To facilitate this, we propose to use a reporting tool from the NLP field, Expected Validation Performance. This technique makes it possible to reliably estimate expected maximum performance under different budgets while not requiring any additional computation beyond hyperparameter search. 
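
The reporting tool referenced in this record, Expected Validation Performance, has a closed form given the scores from a completed hyperparameter search: the expected best score among a budget of n runs drawn i.i.d. from the empirical distribution of the k observed results. The sketch below uses made-up scores; it follows the usual with-replacement formulation and requires no computation beyond the search itself.

```python
import numpy as np

def expected_max_performance(scores, budget):
    """Expected Validation Performance: expected best score among `budget` runs
    drawn i.i.d. from the empirical distribution of hyperparameter-search results."""
    v = np.sort(np.asarray(scores, dtype=float))
    k = len(v)
    i = np.arange(1, k + 1)
    # probability that the maximum of `budget` draws equals the i-th smallest score
    p = (i / k) ** budget - ((i - 1) / k) ** budget
    return float(v @ p)

online_returns = [0.61, 0.64, 0.58, 0.70, 0.66, 0.63]   # hypothetical policy evaluations
for budget in (1, 3, 6):
    print(budget, round(expected_max_performance(online_returns, budget), 3))
```

Plotting this quantity against the budget is what makes algorithm comparisons budget-dependent: a method that looks best with many online evaluations may lose to a simpler baseline when only a handful are affordable.
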
By employing this tool, we also show that Behavioral Cloning is often more favorable to offline RL algorithms when working within a limited budget.", "bibtex": "@InProceedings{pmlr-v162-kurenkov22a,\n title = \t {Showing Your Offline Reinforcement Learning Work: Online Evaluation Budget Matters},\n author = {Kurenkov, Vladislav and Kolesnikov, Sergey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11729--11752},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kurenkov22a/kurenkov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kurenkov22a.html},\n abstract = \t {In this work, we argue for the importance of an online evaluation budget for a reliable comparison of deep offline RL algorithms. First, we delineate that the online evaluation budget is problem-dependent, where some problems allow for less but others for more. And second, we demonstrate that the preference between algorithms is budget-dependent across a diverse range of decision-making domains such as Robotics, Finance, and Energy Management. Following the points above, we suggest reporting the performance of deep offline RL algorithms under varying online evaluation budgets. To facilitate this, we propose to use a reporting tool from the NLP field, Expected Validation Performance. This technique makes it possible to reliably estimate expected maximum performance under different budgets while not requiring any additional computation beyond hyperparameter search. By employing this tool, we also show that Behavioral Cloning is often more favorable to offline RL algorithms when working within a limited budget.}\n}", "pdf": "https://proceedings.mlr.press/v162/kurenkov22a/kurenkov22a.pdf", "supp": "", "pdf_size": 2755629, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8365164032899047401&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Tinkoff, Moscow, Russia; Tinkoff, Moscow, Russia", "aff_domain": "tinkoff.ai; ", "email": "tinkoff.ai; ", "github": "https://github.com/tinkoff-ai/eop", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kurenkov22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Tinkoff", "aff_unique_dep": "", "aff_unique_url": "https://www.tinkoff.ru", "aff_unique_abbr": "Tinkoff", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Moscow", "aff_country_unique_index": "0;0", "aff_country_unique": "Russian Federation" }, { "title": "Shuffle Private Linear Contextual Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16281", "id": "16281", "proceeding": "https://proceedings.mlr.press/v162/chowdhury22a.html", "poster": "/media/PosterPDFs/ICML%202022/0a17ad0fa0870b05f172deeb05efef8e_KmGnGZx.png?t=1657552050.7913504", "slides": "", "author_site": "Sayak Ray Chowdhury, Xingyu Zhou", "author": "Sayak Ray Chowdhury; Xingyu Zhou", "abstract": "Differential privacy (DP) has been recently introduced to linear contextual bandits to formally address the privacy concerns in its associated personalized services to participating users (e.g., recommendations). 
Prior work largely focus on two trust models of DP \u2013 the central model, where a central server is responsible for protecting users\u2019 sensitive data, and the (stronger) local model, where information needs to be protected directly on users\u2019 side. However, there remains a fundamental gap in the utility achieved by learning algorithms under these two privacy models, e.g., if all users are", "bibtex": "@InProceedings{pmlr-v162-chowdhury22a,\n title = \t {Shuffle Private Linear Contextual Bandits},\n author = {Chowdhury, Sayak Ray and Zhou, Xingyu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3984--4009},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chowdhury22a/chowdhury22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chowdhury22a.html},\n abstract = \t {Differential privacy (DP) has been recently introduced to linear contextual bandits to formally address the privacy concerns in its associated personalized services to participating users (e.g., recommendations). Prior work largely focus on two trust models of DP \u2013 the central model, where a central server is responsible for protecting users\u2019 sensitive data, and the (stronger) local model, where information needs to be protected directly on users\u2019 side. However, there remains a fundamental gap in the utility achieved by learning algorithms under these two privacy models, e.g., if all users are", "pdf": "https://proceedings.mlr.press/v162/chowdhury22a/chowdhury22a.pdf", "supp": "", "pdf_size": 9074587, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2781620231659156667&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Boston University, USA+Wayne State University, USA; Wayne State University, USA", "aff_domain": "bu.edu;wayne.edu", "email": "bu.edu;wayne.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/chowdhury22a.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "Boston University;Wayne State University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://wayne.edu", "aff_unique_abbr": "BU;WSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Simple and near-optimal algorithms for hidden stratification and multi-group learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16079", "id": "16079", "proceeding": "https://proceedings.mlr.press/v162/tosh22a.html", "poster": "/media/PosterPDFs/ICML%202022/86e8f7ab32cfd12577bc2619bc635690.png?t=1657564669.933954", "slides": "", "author_site": "Christopher Tosh, Daniel Hsu", "author": "Christopher J Tosh; Daniel Hsu", "abstract": "Multi-group agnostic learning is a formal learning criterion that is concerned with the conditional risks of predictors within subgroups of a population. The criterion addresses recent practical concerns such as subgroup fairness and hidden stratification. 
This paper studies the structure of solutions to the multi-group learning problem, and provides simple and near-optimal algorithms for the learning problem.", "bibtex": "@InProceedings{pmlr-v162-tosh22a,\n title = \t {Simple and near-optimal algorithms for hidden stratification and multi-group learning},\n author = {Tosh, Christopher J and Hsu, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21633--21657},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tosh22a/tosh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/tosh22a.html},\n abstract = \t {Multi-group agnostic learning is a formal learning criterion that is concerned with the conditional risks of predictors within subgroups of a population. The criterion addresses recent practical concerns such as subgroup fairness and hidden stratification. This paper studies the structure of solutions to the multi-group learning problem, and provides simple and near-optimal algorithms for the learning problem.}\n}", "pdf": "https://proceedings.mlr.press/v162/tosh22a/tosh22a.pdf", "supp": "", "pdf_size": 358559, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2545814187495561522&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Memorial Sloan Kettering Cancer Center, New York, NY; Department of Computer Science, Columbia University, New York, NY", "aff_domain": "gmail.com;cs.columbia.edu", "email": "gmail.com;cs.columbia.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/tosh22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Memorial Sloan Kettering Cancer Center;Columbia University", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.mskcc.org;https://www.columbia.edu", "aff_unique_abbr": "MSKCC;Columbia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Simplex Neural Population Learning: Any-Mixture Bayes-Optimality in Symmetric Zero-sum Games", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18415", "id": "18415", "proceeding": "https://proceedings.mlr.press/v162/liu22h.html", "poster": "/media/PosterPDFs/ICML%202022/55563844bcd4bba067fe86ac1f008c7e.png?t=1657645099.6396945", "slides": "", "author_site": "Siqi Liu, Marc Lanctot, Luke Marris, Nicolas Heess", "author": "Siqi Liu; Marc Lanctot; Luke Marris; Nicolas Heess", "abstract": "Learning to play optimally against any mixture over a diverse set of strategies is of important practical interests in competitive games. In this paper, we propose simplex-NeuPL that satisfies two desiderata simultaneously: i) learning a population of strategically diverse basis policies, represented by a single conditional network; ii) using the same network, learn best-responses to any mixture over the simplex of basis policies. We show that the resulting conditional policies incorporate prior information about their opponents effectively, enabling near optimal returns against arbitrary mixture policies in a game with tractable best-responses. 
We verify that such policies behave Bayes-optimally under uncertainty and offer insights in using this flexibility at test time. Finally, we offer evidence that learning best-responses to any mixture policies is an effective auxiliary task for strategic exploration, which, by itself, can lead to more performant populations.", "bibtex": "@InProceedings{pmlr-v162-liu22h,\n title = \t {Simplex Neural Population Learning: Any-Mixture {B}ayes-Optimality in Symmetric Zero-sum Games},\n author = {Liu, Siqi and Lanctot, Marc and Marris, Luke and Heess, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13793--13806},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22h/liu22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22h.html},\n abstract = \t {Learning to play optimally against any mixture over a diverse set of strategies is of important practical interests in competitive games. In this paper, we propose simplex-NeuPL that satisfies two desiderata simultaneously: i) learning a population of strategically diverse basis policies, represented by a single conditional network; ii) using the same network, learn best-responses to any mixture over the simplex of basis policies. We show that the resulting conditional policies incorporate prior information about their opponents effectively, enabling near optimal returns against arbitrary mixture policies in a game with tractable best-responses. We verify that such policies behave Bayes-optimally under uncertainty and offer insights in using this flexibility at test time. 
Finally, we offer evidence that learning best-responses to any mixture policies is an effective auxiliary task for strategic exploration, which, by itself, can lead to more performant populations.}\n}", "pdf": "https://proceedings.mlr.press/v162/liu22h/liu22h.pdf", "supp": "", "pdf_size": 1271457, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7870858366636615706&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University College London, UK+DeepMind, UK; DeepMind, UK; University College London, UK+DeepMind, UK; DeepMind, UK", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/liu22h.html", "aff_unique_index": "0+1;1;0+1;1", "aff_unique_norm": "University College London;DeepMind", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucl.ac.uk;https://deepmind.com", "aff_unique_abbr": "UCL;DeepMind", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "Simultaneous Graph Signal Clustering and Graph Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16475", "id": "16475", "proceeding": "https://proceedings.mlr.press/v162/karaaslanli22a.html", "poster": "/media/PosterPDFs/ICML%202022/a3fc981af450752046be179185ebc8b5.png?t=1657914492.7220633", "slides": "", "author_site": "Abdullah Karaaslanli, Selin Aviyente", "author": "Abdullah Karaaslanli; Selin Aviyente", "abstract": "Graph learning (GL) aims to infer the topology of an unknown graph from a set of observations on its nodes, i.e., graph signals. While most of the existing GL approaches focus on homogeneous datasets, in many real world applications, data is heterogeneous, where graph signals are clustered and each cluster is associated with a different graph. In this paper, we address the problem of learning multiple graphs from heterogeneous data by formulating an optimization problem for joint graph signal clustering and graph topology inference. In particular, our approach extends spectral clustering by partitioning the graph signals not only based on their pairwise similarities but also their smoothness with respect to the graphs associated with the clusters. The proposed method also learns the representative graph for each cluster using the smoothness of the graph signals with respect to the graph topology. The resulting optimization problem is solved with an efficient block-coordinate descent algorithm and results on simulated and real data indicate the effectiveness of the proposed method.", "bibtex": "@InProceedings{pmlr-v162-karaaslanli22a,\n title = \t {Simultaneous Graph Signal Clustering and Graph Learning},\n author = {Karaaslanli, Abdullah and Aviyente, Selin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10762--10772},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/karaaslanli22a/karaaslanli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/karaaslanli22a.html},\n abstract = \t {Graph learning (GL) aims to infer the topology of an unknown graph from a set of observations on its nodes, i.e., graph signals. 
While most of the existing GL approaches focus on homogeneous datasets, in many real world applications, data is heterogeneous, where graph signals are clustered and each cluster is associated with a different graph. In this paper, we address the problem of learning multiple graphs from heterogeneous data by formulating an optimization problem for joint graph signal clustering and graph topology inference. In particular, our approach extends spectral clustering by partitioning the graph signals not only based on their pairwise similarities but also their smoothness with respect to the graphs associated with the clusters. The proposed method also learns the representative graph for each cluster using the smoothness of the graph signals with respect to the graph topology. The resulting optimization problem is solved with an efficient block-coordinate descent algorithm and results on simulated and real data indicate the effectiveness of the proposed method.}\n}", "pdf": "https://proceedings.mlr.press/v162/karaaslanli22a/karaaslanli22a.pdf", "supp": "", "pdf_size": 615494, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1160209730109310337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, Michigan State University, East Lansing, MI, US; Department of Electrical and Computer Engineering, Michigan State University, East Lansing, MI, US", "aff_domain": "msu.edu; ", "email": "msu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/karaaslanli22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Michigan State University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.msu.edu", "aff_unique_abbr": "MSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "East Lansing", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Simultaneously Learning Stochastic and Adversarial Bandits with General Graph Feedback", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17905", "id": "17905", "proceeding": "https://proceedings.mlr.press/v162/kong22b.html", "poster": "/media/PosterPDFs/ICML%202022/10c272d06794d3e5785d5e7c5356e9ff.png?t=1657798642.3935168", "slides": "", "author_site": "Fang Kong, Yichi Zhou, Shuai Li", "author": "Fang Kong; Yichi Zhou; Shuai Li", "abstract": "The problem of online learning with graph feedback has been extensively studied in the literature due to its generality and potential to model various learning tasks. Existing works mainly study the adversarial and stochastic feedback separately. If the prior knowledge of the feedback mechanism is unavailable or wrong, such specially designed algorithms could suffer great loss. To avoid this problem, \\citet{erez2021towards} try to optimize for both environments. However, they assume the feedback graphs are undirected and each vertex has a self-loop, which compromises the generality of the framework and may not be satisfied in applications. With a general feedback graph, the observation of an arm may not be available when this arm is pulled, which makes the exploration more expensive and the algorithms more challenging to perform optimally in both environments. In this work, we overcome this difficulty by a new trade-off mechanism with a carefully-designed proportion for exploration and exploitation. 
We prove the proposed algorithm simultaneously achieves $\\mathrm{poly} \\log T$ regret in the stochastic setting and minimax-optimal regret of $\\tilde{O}(T^{2/3})$ in the adversarial setting where $T$ is the horizon and $\\tilde{O}$ hides parameters independent of $T$ as well as logarithmic terms. To our knowledge, this is the first best-of-both-worlds result for general feedback graphs.", "bibtex": "@InProceedings{pmlr-v162-kong22b,\n title = \t {Simultaneously Learning Stochastic and Adversarial Bandits with General Graph Feedback},\n author = {Kong, Fang and Zhou, Yichi and Li, Shuai},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11473--11482},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kong22b/kong22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/kong22b.html},\n abstract = \t {The problem of online learning with graph feedback has been extensively studied in the literature due to its generality and potential to model various learning tasks. Existing works mainly study the adversarial and stochastic feedback separately. If the prior knowledge of the feedback mechanism is unavailable or wrong, such specially designed algorithms could suffer great loss. To avoid this problem, \\citet{erez2021towards} try to optimize for both environments. However, they assume the feedback graphs are undirected and each vertex has a self-loop, which compromises the generality of the framework and may not be satisfied in applications. With a general feedback graph, the observation of an arm may not be available when this arm is pulled, which makes the exploration more expensive and the algorithms more challenging to perform optimally in both environments. In this work, we overcome this difficulty by a new trade-off mechanism with a carefully-designed proportion for exploration and exploitation. We prove the proposed algorithm simultaneously achieves $\\mathrm{poly} \\log T$ regret in the stochastic setting and minimax-optimal regret of $\\tilde{O}(T^{2/3})$ in the adversarial setting where $T$ is the horizon and $\\tilde{O}$ hides parameters independent of $T$ as well as logarithmic terms. 
To our knowledge, this is the first best-of-both-worlds result for general feedback graphs.}\n}", "pdf": "https://proceedings.mlr.press/v162/kong22b/kong22b.pdf", "supp": "", "pdf_size": 323424, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12167859172934321294&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "John Hopcroft Center for Computer Science, Shanghai Jiao Tong University, Shanghai, China; Microsoft Research Asia, Beijing, China; John Hopcroft Center for Computer Science, Shanghai Jiao Tong University, Shanghai, China", "aff_domain": "sjtu.edu.cn;microsoft.com;sjtu.edu.cn", "email": "sjtu.edu.cn;microsoft.com;sjtu.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kong22b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft", "aff_unique_dep": "John Hopcroft Center for Computer Science;Research", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "SJTU;MSRA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Shanghai;Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Sketching Algorithms and Lower Bounds for Ridge Regression", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16627", "id": "16627", "proceeding": "https://proceedings.mlr.press/v162/kacham22a.html", "poster": "/media/PosterPDFs/ICML%202022/0e915db6326b6fb6a3c56546980a8c93.png?t=1658345489.294208", "slides": "", "author_site": "Praneeth Kacham, David Woodruff", "author": "Praneeth Kacham; David Woodruff", "abstract": "We give a sketching-based iterative algorithm that computes a $1+\\varepsilon$ approximate solution for the ridge regression problem $\\min_x \\|Ax-b\\|_2^2 +\\lambda\\|x\\|_2^2$ where $A \\in R^{n \\times d}$ with $d \\ge n$. Our algorithm, for a constant number of iterations (requiring a constant number of passes over the input), improves upon earlier work (Chowdhury et al.) by requiring that the sketching matrix only has a weaker Approximate Matrix Multiplication (AMM) guarantee that depends on $\\varepsilon$, along with a constant subspace embedding guarantee. The earlier work instead requires that the sketching matrix has a subspace embedding guarantee that depends on $\\varepsilon$. For example, to produce a $1+\\varepsilon$ approximate solution in $1$ iteration, which requires $2$ passes over the input, our algorithm requires the OSNAP embedding to have $m= O(n\\sigma^2/\\lambda\\varepsilon)$ rows with a sparsity parameter $s = O(\\log(n))$, whereas the earlier algorithm of Chowdhury et al. with the same number of rows of OSNAP requires a sparsity $s = O(\\sqrt{\\sigma^2/\\lambda\\varepsilon} \\cdot \\log(n))$, where $\\sigma = \\opnorm{A}$ is the spectral norm of the matrix $A$. We also show that this algorithm can be used to give faster algorithms for kernel ridge regression. Finally, we show that the sketch size required for our algorithm is essentially optimal for a natural framework of algorithms for ridge regression by proving lower bounds on oblivious sketching matrices for AMM. 
The sketch size lower bounds for AMM may be of independent interest.", "bibtex": "@InProceedings{pmlr-v162-kacham22a,\n title = \t {Sketching Algorithms and Lower Bounds for Ridge Regression},\n author = {Kacham, Praneeth and Woodruff, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10539--10556},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kacham22a/kacham22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kacham22a.html},\n abstract = \t {We give a sketching-based iterative algorithm that computes a $1+\\varepsilon$ approximate solution for the ridge regression problem $\\min_x \\|Ax-b\\|_2^2 +\\lambda\\|x\\|_2^2$ where $A \\in R^{n \\times d}$ with $d \\ge n$. Our algorithm, for a constant number of iterations (requiring a constant number of passes over the input), improves upon earlier work (Chowdhury et al.) by requiring that the sketching matrix only has a weaker Approximate Matrix Multiplication (AMM) guarantee that depends on $\\varepsilon$, along with a constant subspace embedding guarantee. The earlier work instead requires that the sketching matrix has a subspace embedding guarantee that depends on $\\varepsilon$. For example, to produce a $1+\\varepsilon$ approximate solution in $1$ iteration, which requires $2$ passes over the input, our algorithm requires the OSNAP embedding to have $m= O(n\\sigma^2/\\lambda\\varepsilon)$ rows with a sparsity parameter $s = O(\\log(n))$, whereas the earlier algorithm of Chowdhury et al. with the same number of rows of OSNAP requires a sparsity $s = O(\\sqrt{\\sigma^2/\\lambda\\varepsilon} \\cdot \\log(n))$, where $\\sigma = \\opnorm{A}$ is the spectral norm of the matrix $A$. We also show that this algorithm can be used to give faster algorithms for kernel ridge regression. Finally, we show that the sketch size required for our algorithm is essentially optimal for a natural framework of algorithms for ridge regression by proving lower bounds on oblivious sketching matrices for AMM. 
The sketch size lower bounds for AMM may be of independent interest.}\n}", "pdf": "https://proceedings.mlr.press/v162/kacham22a/kacham22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kacham22a-supp.zip", "pdf_size": 520811, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6352039928520715641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computer Science Department, Carnegie Mellon University; Computer Science Department, Carnegie Mellon University", "aff_domain": "cs.cmu.edu;andrew.cmu.edu", "email": "cs.cmu.edu;andrew.cmu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kacham22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "SkexGen: Autoregressive Generation of CAD Construction Sequences with Disentangled Codebooks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18217", "id": "18217", "proceeding": "https://proceedings.mlr.press/v162/xu22k.html", "poster": "/media/PosterPDFs/ICML%202022/b6f0479ae87d244975439c6124592772_ksZ7D4K.png?t=1657746789.5214636", "slides": "", "author_site": "Xiang Xu, Karl Willis, Joseph G Lambourne, Chin-Yi Cheng, Pradeep Kumar Jayaraman, Yasutaka Furukawa", "author": "Xiang Xu; Karl D.D. Willis; Joseph G Lambourne; Chin-Yi Cheng; Pradeep Kumar Jayaraman; Yasutaka Furukawa", "abstract": "We present SkexGen, a novel autoregressive generative model for computer-aided design (CAD) construction sequences containing sketch-and-extrude modeling operations. Our model utilizes distinct Transformer architectures to encode topological, geometric, and extrusion variations of construction sequences into disentangled codebooks. Autoregressive Transformer decoders generate CAD construction sequences sharing certain properties specified by the codebook vectors. Extensive experiments demonstrate that our disentangled codebook representation generates diverse and high-quality CAD models, enhances user control, and enables efficient exploration of the design space. The code is available at https://samxuxiang.github.io/skexgen.", "bibtex": "@InProceedings{pmlr-v162-xu22k,\n title = \t {{S}kex{G}en: Autoregressive Generation of {CAD} Construction Sequences with Disentangled Codebooks},\n author = {Xu, Xiang and Willis, Karl D.D. and Lambourne, Joseph G and Cheng, Chin-Yi and Jayaraman, Pradeep Kumar and Furukawa, Yasutaka},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24698--24724},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22k/xu22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22k.html},\n abstract = \t {We present SkexGen, a novel autoregressive generative model for computer-aided design (CAD) construction sequences containing sketch-and-extrude modeling operations. 
Our model utilizes distinct Transformer architectures to encode topological, geometric, and extrusion variations of construction sequences into disentangled codebooks. Autoregressive Transformer decoders generate CAD construction sequences sharing certain properties specified by the codebook vectors. Extensive experiments demonstrate that our disentangled codebook representation generates diverse and high-quality CAD models, enhances user control, and enables efficient exploration of the design space. The code is available at https://samxuxiang.github.io/skexgen.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22k/xu22k.pdf", "supp": "", "pdf_size": 15266328, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17649926719866476829&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Simon Fraser University, Canada; Autodesk Research; Autodesk Research; Autodesk Research; Autodesk Research; Simon Fraser University, Canada", "aff_domain": "sfu.ca; ; ; ; ; ", "email": "sfu.ca; ; ; ; ; ", "github": "https://samxuxiang.github.io/skexgen", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/xu22k.html", "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Simon Fraser University;Autodesk", "aff_unique_dep": ";Autodesk Research", "aff_unique_url": "https://www.sfu.ca;https://research.autodesk.com", "aff_unique_abbr": "SFU;Autodesk", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "Canada;United States" }, { "title": "Skin Deep Unlearning: Artefact and Instrument Debiasing in the Context of Melanoma Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15965", "id": "15965", "proceeding": "https://proceedings.mlr.press/v162/bevan22a.html", "poster": "/media/PosterPDFs/ICML%202022/996a7fa078cc36c46d02f9af3bef918b.png?t=1657295173.003019", "slides": "", "author_site": "Peter J. Bevan, Amir Atapour-Abarghouei", "author": "Peter Bevan; Amir Atapour-Abarghouei", "abstract": "Convolutional Neural Networks have demonstrated dermatologist-level performance in the classification of melanoma from skin lesion images, but prediction irregularities due to biases seen within the training data are an issue that should be addressed before widespread deployment is possible. In this work, we robustly remove bias and spurious variation from an automated melanoma classification pipeline using two leading bias unlearning techniques. We show that the biases introduced by surgical markings and rulers presented in previous studies can be reasonably mitigated using these bias removal methods. We also demonstrate the generalisation benefits of unlearning spurious variation relating to the imaging instrument used to capture lesion images. 
Our experimental results provide evidence that the effects of each of the aforementioned biases are notably reduced, with different debiasing techniques excelling at different tasks.", "bibtex": "@InProceedings{pmlr-v162-bevan22a,\n title = \t {Skin Deep Unlearning: Artefact and Instrument Debiasing in the Context of Melanoma Classification},\n author = {Bevan, Peter and Atapour-Abarghouei, Amir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1874--1892},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bevan22a/bevan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bevan22a.html},\n abstract = \t {Convolutional Neural Networks have demonstrated dermatologist-level performance in the classification of melanoma from skin lesion images, but prediction irregularities due to biases seen within the training data are an issue that should be addressed before widespread deployment is possible. In this work, we robustly remove bias and spurious variation from an automated melanoma classification pipeline using two leading bias unlearning techniques. We show that the biases introduced by surgical markings and rulers presented in previous studies can be reasonably mitigated using these bias removal methods. We also demonstrate the generalisation benefits of unlearning spurious variation relating to the imaging instrument used to capture lesion images. Our experimental results provide evidence that the effects of each of the aforementioned biases are notably reduced, with different debiasing techniques excelling at different tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/bevan22a/bevan22a.pdf", "supp": "", "pdf_size": 9613036, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13843943708217895697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Computing, Newcastle University, Newcastle upon Tyne, UK; Department of Computer Science, Durham University, Durham, UK", "aff_domain": "hotmail.co.uk; ", "email": "hotmail.co.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/bevan22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Newcastle University;Durham University", "aff_unique_dep": "School of Computing;Department of Computer Science", "aff_unique_url": "https://www.ncl.ac.uk;https://www.dur.ac.uk", "aff_unique_abbr": "NU;Durham", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Newcastle upon Tyne;Durham", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Smoothed Adaptive Weighting for Imbalanced Semi-Supervised Learning: Improve Reliability Against Unknown Distribution Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16061", "id": "16061", "proceeding": "https://proceedings.mlr.press/v162/lai22b.html", "poster": "/media/PosterPDFs/ICML%202022/1f0e3dad99908345f7439f8ffabdffc4.png?t=1655013708.60896", "slides": "", "author_site": "Zhengfeng Lai, Chao Wang, Henrry Gunawan, Senching Cheung, Chen-Nee Chuah", "author": "Zhengfeng Lai; Chao Wang; Henrry Gunawan; Sen-Ching S Cheung; Chen-Nee Chuah", "abstract": "Despite recent promising results on semi-supervised learning (SSL), 
data imbalance, particularly in the unlabeled dataset, could significantly impact the training performance of an SSL algorithm if there is a mismatch between the expected and actual class distributions. The efforts on how to construct a robust SSL framework that can effectively learn from datasets with unknown distributions remain limited. We first investigate the feasibility of adding weights to the consistency loss and then we verify the necessity of smoothed weighting schemes. Based on this study, we propose a self-adaptive algorithm, named Smoothed Adaptive Weighting (SAW). SAW is designed to enhance the robustness of SSL by estimating the learning difficulty of each class and synthesizing the weights in the consistency loss based on such estimation. We show that SAW can complement recent consistency-based SSL algorithms and improve their reliability on various datasets including three standard datasets and one gigapixel medical imaging application without making any assumptions about the distribution of the unlabeled set.", "bibtex": "@InProceedings{pmlr-v162-lai22b,\n title = \t {Smoothed Adaptive Weighting for Imbalanced Semi-Supervised Learning: Improve Reliability Against Unknown Distribution Data},\n author = {Lai, Zhengfeng and Wang, Chao and Gunawan, Henrry and Cheung, Sen-Ching S and Chuah, Chen-Nee},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11828--11843},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lai22b/lai22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/lai22b.html},\n abstract = \t {Despite recent promising results on semi-supervised learning (SSL), data imbalance, particularly in the unlabeled dataset, could significantly impact the training performance of an SSL algorithm if there is a mismatch between the expected and actual class distributions. The efforts on how to construct a robust SSL framework that can effectively learn from datasets with unknown distributions remain limited. We first investigate the feasibility of adding weights to the consistency loss and then we verify the necessity of smoothed weighting schemes. Based on this study, we propose a self-adaptive algorithm, named Smoothed Adaptive Weighting (SAW). SAW is designed to enhance the robustness of SSL by estimating the learning difficulty of each class and synthesizing the weights in the consistency loss based on such estimation. We show that SAW can complement recent consistency-based SSL algorithms and improve their reliability on various datasets including three standard datasets and one gigapixel medical imaging application without making any assumptions about the distribution of the unlabeled set.}\n}", "pdf": "https://proceedings.mlr.press/v162/lai22b/lai22b.pdf", "supp": "", "pdf_size": 6775725, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7032330821136017834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, University of California Davis, Davis, CA, USA+Department of Statistics and Data Science, Southern University of Science and Technology, Shenzhen 518055, P.R. 
China; Department of Electrical and Computer Engineering, University of California Davis, Davis, CA, USA+Department of Statistics and Data Science, Southern University of Science and Technology, Shenzhen 518055, P.R. China; Department of Electrical and Computer Engineering, University of California Davis, Davis, CA, USA; Department of Electrical and Computer Engineering, University of Kentucky, Lexington, KY, USA; Department of Electrical and Computer Engineering, University of California Davis, Davis, CA, USA", "aff_domain": "ucdavis.edu; ; ; ; ", "email": "ucdavis.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lai22b.html", "aff_unique_index": "0+1;0+1;0;2;0", "aff_unique_norm": "University of California, Davis;Southern University of Science and Technology;University of Kentucky", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Statistics and Data Science;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.ucdavis.edu;https://www.sustech.edu.cn;https://www.uky.edu", "aff_unique_abbr": "UC Davis;SUSTech;UK", "aff_campus_unique_index": "0+1;0+1;0;2;0", "aff_campus_unique": "Davis;Shenzhen;Lexington", "aff_country_unique_index": "0+1;0+1;0;0;0", "aff_country_unique": "United States;China" }, { "title": "Smoothed Adversarial Linear Contextual Bandits with Knapsacks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17423", "id": "17423", "proceeding": "https://proceedings.mlr.press/v162/sivakumar22a.html", "poster": "", "slides": "", "author_site": "Vidyashankar Sivakumar, Shiliang Zuo, Arindam Banerjee", "author": "Vidyashankar Sivakumar; Shiliang Zuo; Arindam Banerjee", "abstract": "Many bandit problems are characterized by the learner making decisions under constraints. The learner in Linear Contextual Bandits with Knapsacks (LinCBwK) receives a resource consumption vector in addition to a scalar reward in each time step which are both linear functions of the context corresponding to the chosen arm. For a fixed time horizon $T$, the goal of the learner is to maximize rewards while ensuring resource consumptions do not exceed a pre-specified budget. We present algorithms and characterize regret for LinCBwK in the smoothed setting where base context vectors are assumed to be perturbed by Gaussian noise. We consider both the stochastic and adversarial settings for the base contexts, and our analysis of stochastic LinCBwK can be viewed as a warm-up to the more challenging adversarial LinCBwK. For the stochastic setting, we obtain $O(\\sqrt{T})$ additive regret bounds compared to the best context dependent fixed policy. The analysis combines ideas for greedy parameter estimation in \\cite{kmrw18, siwb20} and the primal-dual paradigm first explored in \\cite{agde17, agde14}. Our main contribution is an algorithm with $O(\\log T)$ competitive ratio relative to the best context dependent fixed policy for the adversarial setting. 
The algorithm for the adversarial setting employs ideas from the primal-dual framework \\cite{agde17, agde14} and a novel adaptation of the doubling trick \\cite{isss19}.", "bibtex": "@InProceedings{pmlr-v162-sivakumar22a,\n title = \t {Smoothed Adversarial Linear Contextual Bandits with Knapsacks},\n author = {Sivakumar, Vidyashankar and Zuo, Shiliang and Banerjee, Arindam},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20253--20277},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sivakumar22a/sivakumar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sivakumar22a.html},\n abstract = \t {Many bandit problems are characterized by the learner making decisions under constraints. The learner in Linear Contextual Bandits with Knapsacks (LinCBwK) receives a resource consumption vector in addition to a scalar reward in each time step which are both linear functions of the context corresponding to the chosen arm. For a fixed time horizon $T$, the goal of the learner is to maximize rewards while ensuring resource consumptions do not exceed a pre-specified budget. We present algorithms and characterize regret for LinCBwK in the smoothed setting where base context vectors are assumed to be perturbed by Gaussian noise. We consider both the stochastic and adversarial settings for the base contexts, and our analysis of stochastic LinCBwK can be viewed as a warm-up to the more challenging adversarial LinCBwK. For the stochastic setting, we obtain $O(\\sqrt{T})$ additive regret bounds compared to the best context dependent fixed policy. The analysis combines ideas for greedy parameter estimation in \\cite{kmrw18, siwb20} and the primal-dual paradigm first explored in \\cite{agde17, agde14}. Our main contribution is an algorithm with $O(\\log T)$ competitive ratio relative to the best context dependent fixed policy for the adversarial setting. 
The algorithm for the adversarial setting employs ideas from the primal-dual framework \\cite{agde17, agde14} and a novel adaptation of the doubling trick \\cite{isss19}.}\n}", "pdf": "https://proceedings.mlr.press/v162/sivakumar22a/sivakumar22a.pdf", "supp": "", "pdf_size": 525243, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14894323170570839160&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Amazon; Department of Computer Science, University of Illinois, Urbana-Champaign; Department of Computer Science, University of Illinois, Urbana-Champaign", "aff_domain": "gmail.com;illinois.edu;illinois.edu", "email": "gmail.com;illinois.edu;illinois.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sivakumar22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Amazon;University of Illinois, Urbana-Champaign", "aff_unique_dep": "Amazon.com, Inc.;Department of Computer Science", "aff_unique_url": "https://www.amazon.com;https://illinois.edu", "aff_unique_abbr": "Amazon;UIUC", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "SoQal: Selective Oracle Questioning for Consistency Based Active Learning of Cardiac Signals", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15969", "id": "15969", "proceeding": "https://proceedings.mlr.press/v162/kiyasseh22a.html", "poster": "", "slides": "", "author_site": "Dani Kiyasseh, Tingting Zhu, David Clifton", "author": "Dani Kiyasseh; Tingting Zhu; David A Clifton", "abstract": "Clinical settings are often characterized by abundant unlabelled data and limited labelled data. This is typically driven by the high burden placed on oracles (e.g., physicians) to provide annotations. One way to mitigate this burden is via active learning (AL) which involves the (a) acquisition and (b) annotation of informative unlabelled instances. Whereas previous work addresses either one of these elements independently, we propose an AL framework that addresses both. For acquisition, we propose Bayesian Active Learning by Consistency (BALC), a sub-framework which perturbs both instances and network parameters and quantifies changes in the network output probability distribution. For annotation, we propose SoQal, a sub-framework that dynamically determines whether, for each acquired unlabelled instance, to request a label from an oracle or to pseudo-label it instead. 
We show that BALC can outperform state-of-the-art acquisition functions such as BALD, and SoQal outperforms baseline methods even in the presence of a noisy oracle.", "bibtex": "@InProceedings{pmlr-v162-kiyasseh22a,\n title = \t {{S}o{Q}al: Selective Oracle Questioning for Consistency Based Active Learning of Cardiac Signals},\n author = {Kiyasseh, Dani and Zhu, Tingting and Clifton, David A},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11302--11340},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kiyasseh22a/kiyasseh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kiyasseh22a.html},\n abstract = \t {Clinical settings are often characterized by abundant unlabelled data and limited labelled data. This is typically driven by the high burden placed on oracles (e.g., physicians) to provide annotations. One way to mitigate this burden is via active learning (AL) which involves the (a) acquisition and (b) annotation of informative unlabelled instances. Whereas previous work addresses either one of these elements independently, we propose an AL framework that addresses both. For acquisition, we propose Bayesian Active Learning by Consistency (BALC), a sub-framework which perturbs both instances and network parameters and quantifies changes in the network output probability distribution. For annotation, we propose SoQal, a sub-framework that dynamically determines whether, for each acquired unlabelled instance, to request a label from an oracle or to pseudo-label it instead. 
We show that BALC can outperform state-of-the-art acquisition functions such as BALD, and SoQal outperforms baseline methods even in the presence of a noisy oracle.}\n}", "pdf": "https://proceedings.mlr.press/v162/kiyasseh22a/kiyasseh22a.pdf", "supp": "", "pdf_size": 7913727, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=944185574108691775&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computing and Mathematical Sciences, California Institute of Technology, California, USA; Department of Engineering Science, University of Oxford, Oxford, UK; Department of Engineering Science, University of Oxford, Oxford, UK", "aff_domain": "caltech.edu; ; ", "email": "caltech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kiyasseh22a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "California Institute of Technology;University of Oxford", "aff_unique_dep": "Department of Computing and Mathematical Sciences;Department of Engineering Science", "aff_unique_url": "https://www.caltech.edu;https://www.ox.ac.uk", "aff_unique_abbr": "Caltech;Oxford", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "California;Oxford", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Soft Truncation: A Universal Training Technique of Score-based Diffusion Model for High Precision Score Estimation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17647", "id": "17647", "proceeding": "https://proceedings.mlr.press/v162/kim22i.html", "poster": "/media/PosterPDFs/ICML%202022/c7af0926b294e47e52e46cfebe173f20.png?t=1657596411.5606146", "slides": "", "author_site": "Dongjun Kim, Seungjae Shin, Kyungwoo Song, Wanmo Kang, IL CHUL MOON", "author": "Dongjun Kim; Seungjae Shin; Kyungwoo Song; Wanmo Kang; Il-Chul Moon", "abstract": "Recent advances in diffusion models bring state-of-the-art performance on image generation tasks. However, empirical results from previous research in diffusion models imply an inverse correlation between density estimation and sample generation performances. This paper investigates with sufficient empirical evidence that such inverse correlation happens because density estimation is significantly contributed by small diffusion time, whereas sample generation mainly depends on large diffusion time. However, training a score network well across the entire diffusion time is demanding because the loss scale is significantly imbalanced at each diffusion time. For successful training, therefore, we introduce Soft Truncation, a universally applicable training technique for diffusion models, that softens the fixed and static truncation hyperparameter into a random variable. 
In experiments, Soft Truncation achieves state-of-the-art performance on CIFAR-10, CelebA, CelebA-HQ $256\\times 256$, and STL-10 datasets.", "bibtex": "@InProceedings{pmlr-v162-kim22i,\n title = \t {Soft Truncation: A Universal Training Technique of Score-based Diffusion Model for High Precision Score Estimation},\n author = {Kim, Dongjun and Shin, Seungjae and Song, Kyungwoo and Kang, Wanmo and Moon, Il-Chul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11201--11228},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22i/kim22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22i.html},\n abstract = \t {Recent advances in diffusion models bring state-of-the-art performance on image generation tasks. However, empirical results from previous research in diffusion models imply an inverse correlation between density estimation and sample generation performances. This paper investigates with sufficient empirical evidence that such inverse correlation happens because density estimation is significantly contributed by small diffusion time, whereas sample generation mainly depends on large diffusion time. However, training a score network well across the entire diffusion time is demanding because the loss scale is significantly imbalanced at each diffusion time. For successful training, therefore, we introduce Soft Truncation, a universally applicable training technique for diffusion models, that softens the fixed and static truncation hyperparameter into a random variable. 
In experiments, Soft Truncation achieves state-of-the-art performance on CIFAR-10, CelebA, CelebA-HQ $256\\times 256$, and STL-10 datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22i/kim22i.pdf", "supp": "", "pdf_size": 17852546, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=547732243097530529&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "KAIST, South Korea; KAIST, South Korea; University of Seoul, South Korea; KAIST, South Korea; KAIST, South Korea+Summary.AI", "aff_domain": "kaist.ac.kr; ; ; ; ", "email": "kaist.ac.kr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/kim22i.html", "aff_unique_index": "0;0;1;0;0+2", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Seoul;Summary AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kaist.ac.kr;http://www.useoul.edu;", "aff_unique_abbr": "KAIST;UOS;Summary AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea;" }, { "title": "Solving Stackelberg Prediction Game with Least Squares Loss via Spherically Constrained Least Squares Reformulation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17691", "id": "17691", "proceeding": "https://proceedings.mlr.press/v162/wang22g.html", "poster": "/media/PosterPDFs/ICML%202022/0bf727e907c5fc9d5356f11e4c45d613.png?t=1656557179.4756894", "slides": "/media/icml-2022/Slides/17691.pdf", "author_site": "Jiali Wang, Wen Huang, Rujun Jiang, Xudong Li, Alex Wang", "author": "Jiali Wang; Wen Huang; Rujun Jiang; Xudong Li; Alex L Wang", "abstract": "The Stackelberg prediction game (SPG) is popular in characterizing strategic interactions between a learner and an attacker. As an important special case, the SPG with least squares loss (SPG-LS) has recently received much research attention. Although initially formulated as a difficult bi-level optimization problem, SPG-LS admits tractable reformulations which can be polynomially globally solved by semidefinite programming or second order cone programming. However, all the available approaches are not well-suited for handling large-scale datasets, especially those with huge numbers of features. In this paper, we explore an alternative reformulation of the SPG-LS. By a novel nonlinear change of variables, we rewrite the SPG-LS as a spherically constrained least squares (SCLS) problem. Theoretically, we show that an $\\epsilon$ optimal solutions to the SCLS (and the SPG-LS) can be achieved in $\\tilde O(N/\\sqrt{\\epsilon})$ floating-point operations, where $N$ is the number of nonzero entries in the data matrix. Practically, we apply two well-known methods for solving this new reformulation, i.e., the Krylov subspace method and the Riemannian trust region method. Both algorithms are factorization free so that they are suitable for solving large scale problems. 
Numerical results on both synthetic and real-world datasets indicate that the SPG-LS, equipped with the SCLS reformulation, can be solved orders of magnitude faster than the state of the art.", "bibtex": "@InProceedings{pmlr-v162-wang22g,\n title = \t {Solving Stackelberg Prediction Game with Least Squares Loss via Spherically Constrained Least Squares Reformulation},\n author = {Wang, Jiali and Huang, Wen and Jiang, Rujun and Li, Xudong and Wang, Alex L},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22665--22679},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22g/wang22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22g.html},\n abstract = \t {The Stackelberg prediction game (SPG) is popular in characterizing strategic interactions between a learner and an attacker. As an important special case, the SPG with least squares loss (SPG-LS) has recently received much research attention. Although initially formulated as a difficult bi-level optimization problem, SPG-LS admits tractable reformulations which can be polynomially globally solved by semidefinite programming or second order cone programming. However, all the available approaches are not well-suited for handling large-scale datasets, especially those with huge numbers of features. In this paper, we explore an alternative reformulation of the SPG-LS. By a novel nonlinear change of variables, we rewrite the SPG-LS as a spherically constrained least squares (SCLS) problem. Theoretically, we show that an $\\epsilon$ optimal solutions to the SCLS (and the SPG-LS) can be achieved in $\\tilde O(N/\\sqrt{\\epsilon})$ floating-point operations, where $N$ is the number of nonzero entries in the data matrix. Practically, we apply two well-known methods for solving this new reformulation, i.e., the Krylov subspace method and the Riemannian trust region method. Both algorithms are factorization free so that they are suitable for solving large scale problems. 
Numerical results on both synthetic and real-world datasets indicate that the SPG-LS, equipped with the SCLS reformulation, can be solved orders of magnitude faster than the state of the art.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22g/wang22g.pdf", "supp": "", "pdf_size": 571711, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3554319006994988140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Data Science, Fudan University, China; School of Mathematical Sciences, Xiamen University, China; School of Data Science, Fudan University, China; School of Data Science, Fudan University, China; School of Computer Science, Carnegie Mellon University, USA", "aff_domain": "fudan.edu.cn; ; ; ; ", "email": "fudan.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/wang22g.html", "aff_unique_index": "0;1;0;0;2", "aff_unique_norm": "Fudan University;Xiamen University;Carnegie Mellon University", "aff_unique_dep": "School of Data Science;School of Mathematical Sciences;School of Computer Science", "aff_unique_url": "https://www.fudan.edu.cn;https://www.xmu.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "Fudan;XMU;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "SpaceMAP: Visualizing High-Dimensional Data by Space Expansion", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18169", "id": "18169", "proceeding": "https://proceedings.mlr.press/v162/zu22a.html", "poster": "/media/PosterPDFs/ICML%202022/d0cffa36e832b65fe145a6e9360bda83_fRdTBSi.png?t=1657893027.72574", "slides": "", "author_site": "Xinrui Zu, Qian Tao", "author": "Xinrui Zu; Qian Tao", "abstract": "Dimensionality reduction (DR) of high-dimensional data is of theoretical and practical interest in machine learning. However, there exist intriguing, non-intuitive discrepancies between the geometry of high- and low-dimensional space. We look into such discrepancies and propose a novel visualization method called Space-based Manifold Approximation and Projection (SpaceMAP). Our method establishes an analytical transformation on distance metrics between spaces to address the \u201ccrowding problem\" in DR. With the proposed equivalent extended distance (EED), we are able to match the capacity of high- and low-dimensional space in a principled manner. To handle complex data with different manifold properties, we propose hierarchical manifold approximation to model the similarity function in a data-specific manner. We evaluated SpaceMAP on a range of synthetic and real datasets with varying manifold properties, and demonstrated its excellent performance in comparison with classical and state-of-the-art DR methods. 
In particular, the concept of space expansion provides a generic framework for understanding nonlinear DR methods including the popular t-distributed Stochastic Neighbor Embedding (t-SNE) and Uniform Manifold Approximation and Projection", "bibtex": "@InProceedings{pmlr-v162-zu22a,\n title = \t {{S}pace{MAP}: Visualizing High-Dimensional Data by Space Expansion},\n author = {Zu, Xinrui and Tao, Qian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27707--27723},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zu22a/zu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zu22a.html},\n abstract = \t {Dimensionality reduction (DR) of high-dimensional data is of theoretical and practical interest in machine learning. However, there exist intriguing, non-intuitive discrepancies between the geometry of high- and low-dimensional space. We look into such discrepancies and propose a novel visualization method called Space-based Manifold Approximation and Projection (SpaceMAP). Our method establishes an analytical transformation on distance metrics between spaces to address the \u201ccrowding problem\" in DR. With the proposed equivalent extended distance (EED), we are able to match the capacity of high- and low-dimensional space in a principled manner. To handle complex data with different manifold properties, we propose hierarchical manifold approximation to model the similarity function in a data-specific manner. We evaluated SpaceMAP on a range of synthetic and real datasets with varying manifold properties, and demonstrated its excellent performance in comparison with classical and state-of-the-art DR methods. 
In particular, the concept of space expansion provides a generic framework for understanding nonlinear DR methods including the popular t-distributed Stochastic Neighbor Embedding (t-SNE) and Uniform Manifold Approximation and Projection}\n}", "pdf": "https://proceedings.mlr.press/v162/zu22a/zu22a.pdf", "supp": "", "pdf_size": 8282086, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14917682277136941997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Imaging Physics, Delft University of Technology; Department of Imaging Physics, Delft University of Technology", "aff_domain": "tudelft.nl;tudelft.nl", "email": "tudelft.nl;tudelft.nl", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zu22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Delft University of Technology", "aff_unique_dep": "Department of Imaging Physics", "aff_unique_url": "https://www.tudelft.nl", "aff_unique_abbr": "TUDelft", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Delft", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Sparse Double Descent: Where Network Pruning Aggravates Overfitting", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17249", "id": "17249", "proceeding": "https://proceedings.mlr.press/v162/he22d.html", "poster": "/media/PosterPDFs/ICML%202022/fd4f21f2556dad0ea8b7a5c04eabebda.png?t=1658056996.4509044", "slides": "/media/icml-2022/Slides/17249.pdf", "author_site": "Zheng He, Zeke Xie, Quanzhi Zhu, Zengchang Qin", "author": "Zheng He; Zeke Xie; Quanzhi Zhu; Zengchang Qin", "abstract": "People usually believe that network pruning not only reduces the computational cost of deep networks, but also prevents overfitting by decreasing model capacity. However, our work surprisingly discovers that network pruning sometimes even aggravates overfitting. We report an unexpected sparse double descent phenomenon that, as we increase model sparsity via network pruning, test performance first gets worse (due to overfitting), then gets better (due to relieved overfitting), and gets worse at last (due to forgetting useful information). While recent studies focused on the deep double descent with respect to model overparameterization, they failed to recognize that sparsity may also cause double descent. In this paper, we have three main contributions. First, we report the novel sparse double descent phenomenon through extensive experiments. Second, for this phenomenon, we propose a novel learning distance interpretation that the curve of l2 learning distance of sparse models (from initialized parameters to final parameters) may correlate with the sparse double descent curve well and reflect generalization better than minima flatness. 
Third, in the context of sparse double descent, a winning ticket in the lottery ticket hypothesis surprisingly may not always win.", "bibtex": "@InProceedings{pmlr-v162-he22d,\n title = \t {Sparse Double Descent: Where Network Pruning Aggravates Overfitting},\n author = {He, Zheng and Xie, Zeke and Zhu, Quanzhi and Qin, Zengchang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8635--8659},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/he22d/he22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/he22d.html},\n abstract = \t {People usually believe that network pruning not only reduces the computational cost of deep networks, but also prevents overfitting by decreasing model capacity. However, our work surprisingly discovers that network pruning sometimes even aggravates overfitting. We report an unexpected sparse double descent phenomenon that, as we increase model sparsity via network pruning, test performance first gets worse (due to overfitting), then gets better (due to relieved overfitting), and gets worse at last (due to forgetting useful information). While recent studies focused on the deep double descent with respect to model overparameterization, they failed to recognize that sparsity may also cause double descent. In this paper, we have three main contributions. First, we report the novel sparse double descent phenomenon through extensive experiments. Second, for this phenomenon, we propose a novel learning distance interpretation that the curve of l2 learning distance of sparse models (from initialized parameters to final parameters) may correlate with the sparse double descent curve well and reflect generalization better than minima flatness. 
Third, in the context of sparse double descent, a winning ticket in the lottery ticket hypothesis surprisingly may not always win.}\n}", "pdf": "https://proceedings.mlr.press/v162/he22d/he22d.pdf", "supp": "", "pdf_size": 1616572, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13575634226332267218&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Intelligent Computing and Machine Learning Lab, School of ASEE, Beihang University, Beijing, China+The University of Tokyo+RIKEN Center for AIP; The University of Tokyo+RIKEN Center for AIP; Intelligent Computing and Machine Learning Lab, School of ASEE, Beihang University, Beijing, China; Intelligent Computing and Machine Learning Lab, School of ASEE, Beihang University, Beijing, China", "aff_domain": "buaa.edu.cn; ; ;buaa.edu.cn", "email": "buaa.edu.cn; ; ;buaa.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/he22d.html", "aff_unique_index": "0+1+2;1+2;0;0", "aff_unique_norm": "Beihang University;University of Tokyo;RIKEN", "aff_unique_dep": "School of ASEE;;Center for AIP", "aff_unique_url": "http://www.buaa.edu.cn;https://www.u-tokyo.ac.jp;https://www.riken.jp", "aff_unique_abbr": "Beihang;UTokyo;RIKEN", "aff_campus_unique_index": "0;;0;0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0+1+1;1+1;0;0", "aff_country_unique": "China;Japan" }, { "title": "Sparse Invariant Risk Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17747", "id": "17747", "proceeding": "https://proceedings.mlr.press/v162/zhou22e.html", "poster": "", "slides": "", "author_site": "Xiao Zhou, Yong LIN, Weizhong Zhang, Tong Zhang", "author": "Xiao Zhou; Yong Lin; Weizhong Zhang; Tong Zhang", "abstract": "Invariant Risk Minimization (IRM) is an emerging invariant feature extracting technique to help generalization with distributional shift. However, we find that there exists a basic and intractable contradiction between the model trainability and generalization ability in IRM. On one hand, recent studies on deep learning theory indicate the importance of large-sized or even overparameterized neural networks to make the model easy to train. On the other hand, unlike empirical risk minimization that can benefit from overparameterization, our empirical and theoretical analyses show that the generalization ability of IRM is much easier to be demolished by overfitting caused by overparameterization. In this paper, we propose a simple yet effective paradigm named Sparse Invariant Risk Minimization (SparseIRM) to address this contradiction. Our key idea is to employ a global sparsity constraint as a defense to prevent spurious features from leaking in during the whole IRM process. Compared with the sparsify-after-training prototype of prior work, which can discard invariant features, the global sparsity constraint limits the budget for feature selection and enforces SparseIRM to select the invariant features. We illustrate the benefit of SparseIRM through a theoretical analysis on a simple linear case. 
Empirically we demonstrate the power of SparseIRM through various datasets and models and surpass state-of-the-art methods with a gap up to 29%.", "bibtex": "@InProceedings{pmlr-v162-zhou22e,\n title = \t {Sparse Invariant Risk Minimization},\n author = {Zhou, Xiao and Lin, Yong and Zhang, Weizhong and Zhang, Tong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27222--27244},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22e/zhou22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22e.html},\n abstract = \t {Invariant Risk Minimization (IRM) is an emerging invariant feature extracting technique to help generalization with distributional shift. However, we find that there exists a basic and intractable contradiction between the model trainability and generalization ability in IRM. On one hand, recent studies on deep learning theory indicate the importance of large-sized or even overparameterized neural networks to make the model easy to train. On the other hand, unlike empirical risk minimization that can benefit from overparameterization, our empirical and theoretical analyses show that the generalization ability of IRM is much easier to be demolished by overfitting caused by overparameterization. In this paper, we propose a simple yet effective paradigm named Sparse Invariant Risk Minimization (SparseIRM) to address this contradiction. Our key idea is to employ a global sparsity constraint as a defense to prevent spurious features from leaking in during the whole IRM process. Compared with the sparsify-after-training prototype of prior work, which can discard invariant features, the global sparsity constraint limits the budget for feature selection and enforces SparseIRM to select the invariant features. We illustrate the benefit of SparseIRM through a theoretical analysis on a simple linear case. 
Empirically we demonstrate the power of SparseIRM through various datasets and models and surpass state-of-the-art methods with a gap up to 29%.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22e/zhou22e.pdf", "supp": "", "pdf_size": 2001421, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11225567002410127555&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "The Hong Kong University of Science and Technology; The Hong Kong University of Science and Technology; The Hong Kong University of Science and Technology; Google Research", "aff_domain": "ust.hk;ust.hk;ust.hk;tongzhang-ml.org", "email": "ust.hk;ust.hk;ust.hk;tongzhang-ml.org", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhou22e.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.ust.hk;https://research.google", "aff_unique_abbr": "HKUST;Google Research", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Hong Kong SAR;Mountain View", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Sparse Mixed Linear Regression with Guarantees: Taming an Intractable Problem with Invex Relaxation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17829", "id": "17829", "proceeding": "https://proceedings.mlr.press/v162/barik22a.html", "poster": "/media/PosterPDFs/ICML%202022/f900afc719d7d86bd7d17b9d3be0aafe.png?t=1657844107.2974515", "slides": "", "author_site": "Adarsh Barik, Jean Honorio", "author": "Adarsh Barik; Jean Honorio", "abstract": "In this paper, we study the problem of sparse mixed linear regression on an unlabeled dataset that is generated from linear measurements from two different regression parameter vectors. Since the data is unlabeled, our task is to not only figure out a good approximation of regression parameter vectors but also label the dataset correctly. In its original form, this problem is NP-hard. The most popular algorithms to solve this problem (such as Expectation-Maximization) have a tendency to get stuck at local minima. We provide a novel invex relaxation for this intractable problem which leads to a solution with provable theoretical guarantees. This relaxation enables exact recovery of data labels. Furthermore, we recover close approximation of regression parameter vectors which match the true parameter vectors in support and sign. Our formulation uses a carefully constructed primal dual witnesses framework for the invex problem. 
Furthermore, we show that the sample complexity of our method is only logarithmic in terms of the dimension of the regression parameter vectors.", "bibtex": "@InProceedings{pmlr-v162-barik22a,\n title = \t {Sparse Mixed Linear Regression with Guarantees: Taming an Intractable Problem with Invex Relaxation},\n author = {Barik, Adarsh and Honorio, Jean},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1627--1646},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/barik22a/barik22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/barik22a.html},\n abstract = \t {In this paper, we study the problem of sparse mixed linear regression on an unlabeled dataset that is generated from linear measurements from two different regression parameter vectors. Since the data is unlabeled, our task is to not only figure out a good approximation of regression parameter vectors but also label the dataset correctly. In its original form, this problem is NP-hard. The most popular algorithms to solve this problem (such as Expectation-Maximization) have a tendency to get stuck at local minima. We provide a novel invex relaxation for this intractable problem which leads to a solution with provable theoretical guarantees. This relaxation enables exact recovery of data labels. Furthermore, we recover close approximation of regression parameter vectors which match the true parameter vectors in support and sign. Our formulation uses a carefully constructed primal dual witnesses framework for the invex problem. Furthermore, we show that the sample complexity of our method is only logarithmic in terms of the dimension of the regression parameter vectors.}\n}", "pdf": "https://proceedings.mlr.press/v162/barik22a/barik22a.pdf", "supp": "", "pdf_size": 757705, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5602192829478945425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Purdue University; Department of Computer Science, Purdue University", "aff_domain": "purdue.edu;purdue.edu", "email": "purdue.edu;purdue.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/barik22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Sparsity in Partially Controllable Linear Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16911", "id": "16911", "proceeding": "https://proceedings.mlr.press/v162/efroni22b.html", "poster": "", "slides": "", "author_site": "Yonathan Efroni, Sham Kakade, Akshay Krishnamurthy, Cyril Zhang", "author": "Yonathan Efroni; Sham Kakade; Akshay Krishnamurthy; Cyril Zhang", "abstract": "A fundamental concept in control theory is that of controllability, where any system state can be reached through an appropriate choice of control inputs. Indeed, a large body of classical and modern approaches are designed for controllable linear dynamical systems. 
However, in practice, we often encounter systems in which a large set of state variables evolve exogenously and independently of the control inputs; such systems are only partially controllable. The focus of this work is on a large class of partially controllable linear dynamical systems, specified by an underlying sparsity pattern. Our main results establish structural conditions and finite-sample guarantees for learning to control such systems. In particular, our structural results characterize those state variables which are irrelevant for optimal control, an analysis which departs from classical control techniques. Our algorithmic results adapt techniques from high-dimensional statistics{\u2014}specifically soft-thresholding and semiparametric least-squares{\u2014}to exploit the underlying sparsity pattern in order to obtain finite-sample guarantees that significantly improve over those based on certainty-equivalence. We also corroborate these theoretical improvements over certainty-equivalent control through a simulation study.", "bibtex": "@InProceedings{pmlr-v162-efroni22b,\n title = \t {Sparsity in Partially Controllable Linear Systems},\n author = {Efroni, Yonathan and Kakade, Sham and Krishnamurthy, Akshay and Zhang, Cyril},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5851--5860},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/efroni22b/efroni22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/efroni22b.html},\n abstract = \t {A fundamental concept in control theory is that of controllability, where any system state can be reached through an appropriate choice of control inputs. Indeed, a large body of classical and modern approaches are designed for controllable linear dynamical systems. However, in practice, we often encounter systems in which a large set of state variables evolve exogenously and independently of the control inputs; such systems are only partially controllable. The focus of this work is on a large class of partially controllable linear dynamical systems, specified by an underlying sparsity pattern. Our main results establish structural conditions and finite-sample guarantees for learning to control such systems. In particular, our structural results characterize those state variables which are irrelevant for optimal control, an analysis which departs from classical control techniques. Our algorithmic results adapt techniques from high-dimensional statistics{\u2014}specifically soft-thresholding and semiparametric least-squares{\u2014}to exploit the underlying sparsity pattern in order to obtain finite-sample guarantees that significantly improve over those based on certainty-equivalence. 
We also corroborate these theoretical improvements over certainty-equivalent control through a simulation study.}\n}", "pdf": "https://proceedings.mlr.press/v162/efroni22b/efroni22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/efroni22b-supp.zip", "pdf_size": 340237, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12915031028027795512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Microsoft Research; Microsoft Research; Microsoft Research; Microsoft Research", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/efroni22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Spatial-Channel Token Distillation for Vision MLPs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18389", "id": "18389", "proceeding": "https://proceedings.mlr.press/v162/li22c.html", "poster": "/media/PosterPDFs/ICML%202022/c9f0f895fb98ab9159f51fd0297e236d.png?t=1657782430.2942555", "slides": "", "author_site": "Yanxi Li, Xinghao Chen, Minjing Dong, Yehui Tang, Yunhe Wang, Chang Xu", "author": "Yanxi Li; Xinghao Chen; Minjing Dong; Yehui Tang; Yunhe Wang; Chang Xu", "abstract": "Recently, neural architectures with all Multi-layer Perceptrons (MLPs) have attracted great research interest from the computer vision community. However, the inefficient mixing of spatial-channel information causes MLP-like vision models to demand tremendous pre-training on large-scale datasets. This work solves the problem from a novel knowledge distillation perspective. We propose a novel Spatial-channel Token Distillation (STD) method, which improves the information mixing in the two dimensions by introducing distillation tokens to each of them. A mutual information regularization is further introduced to let distillation tokens focus on their specific dimensions and maximize the performance gain. Extensive experiments on ImageNet for several MLP-like architectures demonstrate that the proposed token distillation mechanism can efficiently improve the accuracy. For example, the proposed STD boosts the top-1 accuracy of Mixer-S16 on ImageNet from 73.8% to 75.7% without any costly pre-training on JFT-300M. When applied to stronger architectures, e.g. 
CycleMLP-B1 and CycleMLP-B2, STD can still harvest about 1.1% and 0.5% accuracy gains, respectively.", "bibtex": "@InProceedings{pmlr-v162-li22c,\n title = \t {Spatial-Channel Token Distillation for Vision {MLP}s},\n author = {Li, Yanxi and Chen, Xinghao and Dong, Minjing and Tang, Yehui and Wang, Yunhe and Xu, Chang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12685--12695},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/li22c/li22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/li22c.html},\n abstract = \t {Recently, neural architectures with all Multi-layer Perceptrons (MLPs) have attracted great research interest from the computer vision community. However, the inefficient mixing of spatial-channel information causes MLP-like vision models to demand tremendous pre-training on large-scale datasets. This work solves the problem from a novel knowledge distillation perspective. We propose a novel Spatial-channel Token Distillation (STD) method, which improves the information mixing in the two dimensions by introducing distillation tokens to each of them. A mutual information regularization is further introduced to let distillation tokens focus on their specific dimensions and maximize the performance gain. Extensive experiments on ImageNet for several MLP-like architectures demonstrate that the proposed token distillation mechanism can efficiently improve the accuracy. For example, the proposed STD boosts the top-1 accuracy of Mixer-S16 on ImageNet from 73.8% to 75.7% without any costly pre-training on JFT-300M. When applied to stronger architectures, e.g. 
CycleMLP-B1 and CycleMLP-B2, STD can still harvest about 1.1% and 0.5% accuracy gains, respectively.}\n}", "pdf": "https://proceedings.mlr.press/v162/li22c/li22c.pdf", "supp": "", "pdf_size": 379595, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14679613083763785762&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "School of Computer Science, University of Sydney, Australia+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; School of Computer Science, University of Sydney, Australia+Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab+School of Artificial Intelligence, Peking University; Huawei Noah\u2019s Ark Lab; School of Computer Science, University of Sydney, Australia", "aff_domain": "sydney.edu.au;huawei.com;sydney.edu.au;pku.edu.cn;huawei.com;sydney.edu.au", "email": "sydney.edu.au;huawei.com;sydney.edu.au;pku.edu.cn;huawei.com;sydney.edu.au", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/li22c.html", "aff_unique_index": "0+1;1;0+1;1+2;1;0", "aff_unique_norm": "University of Sydney;Huawei;Peking University", "aff_unique_dep": "School of Computer Science;Noah\u2019s Ark Lab;School of Artificial Intelligence", "aff_unique_url": "https://www.sydney.edu.au;https://www.huawei.com;http://www.pku.edu.cn", "aff_unique_abbr": "USYD;Huawei;PKU", "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Sydney;", "aff_country_unique_index": "0+1;1;0+1;1+1;1;0", "aff_country_unique": "Australia;China" }, { "title": "Spectral Representation of Robustness Measures for Optimization Under Input Uncertainty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17741", "id": "17741", "proceeding": "https://proceedings.mlr.press/v162/qing22a.html", "poster": "/media/PosterPDFs/ICML%202022/8004d637b6236202217be3dfcdd8ce59_uglF3G0.png?t=1657574185.1631842", "slides": "", "author_site": "Jixiang Qing, Tom Dhaene, Ivo Couckuyt", "author": "Jixiang Qing; Tom Dhaene; Ivo Couckuyt", "abstract": "We study the inference of mean-variance robustness measures to quantify input uncertainty under the Gaussian Process (GP) framework. These measures are widely used in applications where the robustness of the solution is of interest, for example, in engineering design. While the variance is commonly used to characterize the robustness, Bayesian inference of the variance using GPs is known to be challenging. In this paper, we propose a Spectral Representation of Robustness Measures based on the GP\u2019s spectral representation, i.e., an analytical approach to approximately infer both robustness measures for normal and uniform input uncertainty distributions. We present two approximations based on different Fourier features and compare their accuracy numerically. To demonstrate their utility and efficacy in robust Bayesian Optimization, we integrate the analytical robustness measures in three standard acquisition functions for various robust optimization formulations. 
We show their competitive performance on numerical benchmarks and real-life applications.", "bibtex": "@InProceedings{pmlr-v162-qing22a,\n title = \t {Spectral Representation of Robustness Measures for Optimization Under Input Uncertainty},\n author = {Qing, Jixiang and Dhaene, Tom and Couckuyt, Ivo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18096--18121},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qing22a/qing22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qing22a.html},\n abstract = \t {We study the inference of mean-variance robustness measures to quantify input uncertainty under the Gaussian Process (GP) framework. These measures are widely used in applications where the robustness of the solution is of interest, for example, in engineering design. While the variance is commonly used to characterize the robustness, Bayesian inference of the variance using GPs is known to be challenging. In this paper, we propose a Spectral Representation of Robustness Measures based on the GP\u2019s spectral representation, i.e., an analytical approach to approximately infer both robustness measures for normal and uniform input uncertainty distributions. We present two approximations based on different Fourier features and compare their accuracy numerically. To demonstrate their utility and efficacy in robust Bayesian Optimization, we integrate the analytical robustness measures in three standard acquisition functions for various robust optimization formulations. 
We show their competitive performance on numerical benchmarks and real-life applications.}\n}", "pdf": "https://proceedings.mlr.press/v162/qing22a/qing22a.pdf", "supp": "", "pdf_size": 9575314, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12847297656425234301&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Ghent University \u2013 imec, IDLab, Department of Information Technology (INTEC), Tech Lane \u2013 Zwijnaarde 126, 9052 Ghent, Belgium; Ghent University \u2013 imec, IDLab, Department of Information Technology (INTEC), Tech Lane \u2013 Zwijnaarde 126, 9052 Ghent, Belgium; Ghent University \u2013 imec, IDLab, Department of Information Technology (INTEC), Tech Lane \u2013 Zwijnaarde 126, 9052 Ghent, Belgium", "aff_domain": "UGent.be; ; ", "email": "UGent.be; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/qing22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Ghent University", "aff_unique_dep": "Department of Information Technology (INTEC)", "aff_unique_url": "https://www.ugent.be", "aff_unique_abbr": "UGent", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ghent", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Belgium" }, { "title": "SpeqNets: Sparsity-aware permutation-equivariant graph networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17681", "id": "17681", "proceeding": "https://proceedings.mlr.press/v162/morris22a.html", "poster": "/media/PosterPDFs/ICML%202022/5ef78f63ba22e7dfb2fa44613311b932_SrCg0hP.png?t=1657915221.853006", "slides": "/media/icml-2022/Slides/17681.pdf", "author_site": "Christopher Morris, Gaurav Rattan, Sandra Kiefer, Siamak Ravanbakhsh", "author": "Christopher Morris; Gaurav Rattan; Sandra Kiefer; Siamak Ravanbakhsh", "abstract": "While message-passing graph neural networks have clear limitations in approximating permutation-equivariant functions over graphs or general relational data, more expressive, higher-order graph neural networks do not scale to large graphs. They either operate on $k$-order tensors or consider all $k$-node subgraphs, implying an exponential dependence on $k$ in memory requirements, and do not adapt to the sparsity of the graph. By introducing new heuristics for the graph isomorphism problem, we devise a class of universal, permutation-equivariant graph networks, which, unlike previous architectures, offer a fine-grained control between expressivity and scalability and adapt to the sparsity of the graph. 
These architectures lead to vastly reduced computation times compared to standard higher-order graph networks in the supervised node- and graph-level classification and regression regime while significantly improving standard graph neural network and graph kernel architectures in terms of predictive performance.", "bibtex": "@InProceedings{pmlr-v162-morris22a,\n title = \t {{S}peq{N}ets: Sparsity-aware permutation-equivariant graph networks},\n author = {Morris, Christopher and Rattan, Gaurav and Kiefer, Sandra and Ravanbakhsh, Siamak},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16017--16042},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/morris22a/morris22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/morris22a.html},\n abstract = \t {While message-passing graph neural networks have clear limitations in approximating permutation-equivariant functions over graphs or general relational data, more expressive, higher-order graph neural networks do not scale to large graphs. They either operate on $k$-order tensors or consider all $k$-node subgraphs, implying an exponential dependence on $k$ in memory requirements, and do not adapt to the sparsity of the graph. By introducing new heuristics for the graph isomorphism problem, we devise a class of universal, permutation-equivariant graph networks, which, unlike previous architectures, offer a fine-grained control between expressivity and scalability and adapt to the sparsity of the graph. 
These architectures lead to vastly reduced computation times compared to standard higher-order graph networks in the supervised node- and graph-level classification and regression regime while significantly improving standard graph neural network and graph kernel architectures in terms of predictive performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/morris22a/morris22a.pdf", "supp": "", "pdf_size": 524547, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18273879943488078405&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, RWTH Aachen University, Aachen, Germany+Department of Computer Science, McGill University, Montreal, Canada+Mila, Quebec AI Institute; Department of Computer Science, RWTH Aachen University, Aachen, Germany; Max Planck Institute for Software Systems, Saarland Informatics Campus, Germany; Department of Computer Science, McGill University, Montreal, Canada+Mila, Quebec AI Institute", "aff_domain": "christophermorris.info; ;mpi-sws.org; ", "email": "christophermorris.info; ;mpi-sws.org; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/morris22a.html", "aff_unique_index": "0+1+2;0;3;1+2", "aff_unique_norm": "RWTH Aachen University;McGill University;Quebec AI Institute;Max Planck Institute for Software Systems", "aff_unique_dep": "Department of Computer Science;Department of Computer Science;AI Institute;", "aff_unique_url": "https://www.rwth-aachen.de;https://www.mcgill.ca;https://www.mila.quebec;https://www.mpi-sws.org", "aff_unique_abbr": "RWTH;McGill;Mila;MPI-SWS", "aff_campus_unique_index": "0+1+2;0;3;1+2", "aff_campus_unique": "Aachen;Montreal;Quebec;Saarland", "aff_country_unique_index": "0+1+1;0;0;1+1", "aff_country_unique": "Germany;Canada" }, { "title": "Stability Based Generalization Bounds for Exponential Family Langevin Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16621", "id": "16621", "proceeding": "https://proceedings.mlr.press/v162/banerjee22a.html", "poster": "/media/PosterPDFs/ICML%202022/61bdf049525b7d4c2cf79257ec7c2c56.png?t=1657821496.2538016", "slides": "", "author_site": "Arindam Banerjee, Tiancong Chen, Xinyan Li, Yingxue Zhou", "author": "Arindam Banerjee; Tiancong Chen; Xinyan Li; Yingxue Zhou", "abstract": "Recent years have seen advances in generalization bounds for noisy stochastic algorithms, especially stochastic gradient Langevin dynamics (SGLD) based on stability (Mou et al., 2018; Li et al., 2020) and information theoretic approaches (Xu & Raginsky, 2017; Negrea et al., 2019; Steinke & Zakynthinou, 2020). In this paper, we unify and substantially generalize stability based generalization bounds and make three technical contributions. First, we bound the generalization error in terms of expected (not uniform) stability which arguably leads to quantitatively sharper bounds. Second, as our main contribution, we introduce Exponential Family Langevin Dynamics (EFLD), a substantial generalization of SGLD, which includes noisy versions of Sign-SGD and quantized SGD as special cases. We establish data dependent expected stability based generalization bounds for any EFLD algorithm with a O(1/n) sample dependence and dependence on gradient discrepancy rather than the norm of gradients, yielding significantly sharper bounds. Third, we establish optimization guarantees for special cases of EFLD. 
Further, empirical results on benchmarks illustrate that our bounds are non-vacuous, quantitatively sharper than existing bounds, and behave correctly under noisy labels.", "bibtex": "@InProceedings{pmlr-v162-banerjee22a,\n title = \t {Stability Based Generalization Bounds for Exponential Family {L}angevin Dynamics},\n author = {Banerjee, Arindam and Chen, Tiancong and Li, Xinyan and Zhou, Yingxue},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1412--1449},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/banerjee22a/banerjee22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/banerjee22a.html},\n abstract = \t {Recent years have seen advances in generalization bounds for noisy stochastic algorithms, especially stochastic gradient Langevin dynamics (SGLD) based on stability (Mou et al., 2018; Li et al., 2020) and information theoretic approaches (Xu & Raginsky, 2017; Negrea et al., 2019; Steinke & Zakynthinou, 2020). In this paper, we unify and substantially generalize stability based generalization bounds and make three technical contributions. First, we bound the generalization error in terms of expected (not uniform) stability which arguably leads to quantitatively sharper bounds. Second, as our main contribution, we introduce Exponential Family Langevin Dynamics (EFLD), a substantial generalization of SGLD, which includes noisy versions of Sign-SGD and quantized SGD as special cases. We establish data dependent expected stability based generalization bounds for any EFLD algorithm with a O(1/n) sample dependence and dependence on gradient discrepancy rather than the norm of gradients, yielding significantly sharper bounds. Third, we establish optimization guarantees for special cases of EFLD. 
Further, empirical results on benchmarks illustrate that our bounds are non-vacuous, quantitatively sharper than existing bounds, and behave correctly under noisy labels.}\n}", "pdf": "https://proceedings.mlr.press/v162/banerjee22a/banerjee22a.pdf", "supp": "", "pdf_size": 2320414, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4544110063909364120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, University of Illinois Urbana-Champaign; Department of Computer Science, University of Minnesota, Twin Cities; Department of Computer Science, University of Minnesota, Twin Cities; Department of Computer Science, University of Minnesota, Twin Cities", "aff_domain": "umn.edu; ; ; ", "email": "umn.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/banerjee22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Minnesota", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://illinois.edu;https://www.minnesota.edu", "aff_unique_abbr": "UIUC;UMN", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Urbana-Champaign;Twin Cities", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stabilizing Off-Policy Deep Reinforcement Learning from Pixels", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18001", "id": "18001", "proceeding": "https://proceedings.mlr.press/v162/cetin22a.html", "poster": "/media/PosterPDFs/ICML%202022/caf1a3dfb505ffed0d024130f58c5cfa_BpJWwwI.png?t=1657797621.2639067", "slides": "", "author_site": "Edoardo Cetin, Philip Ball, Stephen Roberts, Oya Celiktutan", "author": "Edoardo Cetin; Philip J Ball; Stephen Roberts; Oya Celiktutan", "abstract": "Off-policy reinforcement learning (RL) from pixel observations is notoriously unstable. As a result, many successful algorithms must combine different domain-specific practices and auxiliary losses to learn meaningful behaviors in complex environments. In this work, we provide novel analysis demonstrating that these instabilities arise from performing temporal-difference learning with a convolutional encoder and low-magnitude rewards. We show that this new visual deadly triad causes unstable training and premature convergence to degenerate solutions, a phenomenon we name catastrophic self-overfitting. Based on our analysis, we propose A-LIX, a method providing adaptive regularization to the encoder\u2019s gradients that explicitly prevents the occurrence of catastrophic self-overfitting using a dual objective. 
By applying A-LIX, we significantly outperform the prior state-of-the-art on the DeepMind Control and Atari benchmarks without any data augmentation or auxiliary losses.", "bibtex": "@InProceedings{pmlr-v162-cetin22a,\n title = \t {Stabilizing Off-Policy Deep Reinforcement Learning from Pixels},\n author = {Cetin, Edoardo and Ball, Philip J and Roberts, Stephen and Celiktutan, Oya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2784--2810},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cetin22a/cetin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cetin22a.html},\n abstract = \t {Off-policy reinforcement learning (RL) from pixel observations is notoriously unstable. As a result, many successful algorithms must combine different domain-specific practices and auxiliary losses to learn meaningful behaviors in complex environments. In this work, we provide novel analysis demonstrating that these instabilities arise from performing temporal-difference learning with a convolutional encoder and low-magnitude rewards. We show that this new visual deadly triad causes unstable training and premature convergence to degenerate solutions, a phenomenon we name catastrophic self-overfitting. Based on our analysis, we propose A-LIX, a method providing adaptive regularization to the encoder\u2019s gradients that explicitly prevents the occurrence of catastrophic self-overfitting using a dual objective. By applying A-LIX, we significantly outperform the prior state-of-the-art on the DeepMind Control and Atari benchmarks without any data augmentation or auxiliary losses.}\n}", "pdf": "https://proceedings.mlr.press/v162/cetin22a/cetin22a.pdf", "supp": "", "pdf_size": 9674829, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14839229722928778219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Centre for Robotics Research, Department of Engineering, King\u2019s College London; Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford; Centre for Robotics Research, Department of Engineering, King\u2019s College London", "aff_domain": "kcl.ac.uk;robots.ox.ac.uk; ;kcl.ac.uk", "email": "kcl.ac.uk;robots.ox.ac.uk; ;kcl.ac.uk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/cetin22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "King\u2019s College London;University of Oxford", "aff_unique_dep": "Department of Engineering;Department of Engineering Science", "aff_unique_url": "https://www.kcl.ac.uk;https://www.ox.ac.uk", "aff_unique_abbr": "KCL;Oxford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Stabilizing Q-learning with Linear Architectures for Provable Efficient Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16469", "id": "16469", "proceeding": "https://proceedings.mlr.press/v162/zanette22a.html", "poster": "", "slides": "", "author_site": "Andrea Zanette, Martin Wainwright", "author": "Andrea Zanette; Martin Wainwright", "abstract": "The Q-learning algorithm is a 
simple, fundamental and practically very effective reinforcement learning algorithm. However, the basic protocol can exhibit an unstable behavior when implemented even with simple linear function approximation. While tools like target networks and experience replay are often implemented to stabilize the learning process, the individual contribution of each of these mechanisms is not well understood theoretically. This work proposes an exploration variant of the basic Q-learning protocol with linear function approximation. Our modular analysis illustrates the role played by each algorithmic tool that we adopt: a second order update rule, a set of target networks, and a mechanism akin to experience replay. Together, they enable state of the art regret bounds on linear MDPs while preserving the most prominent feature of the algorithm, namely a space complexity independent of the number of steps elapsed. Furthermore, we show that the performance of the algorithm degrades very gracefully under a new, more permissive notion of approximation error. Finally, the algorithm partially inherits problem dependent regret bounds, function of the number of \u2018effective\u2019 feature dimension.", "bibtex": "@InProceedings{pmlr-v162-zanette22a,\n title = \t {Stabilizing Q-learning with Linear Architectures for Provable Efficient Learning},\n author = {Zanette, Andrea and Wainwright, Martin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25920--25954},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zanette22a/zanette22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/zanette22a.html},\n abstract = \t {The Q-learning algorithm is a simple, fundamental and practically very effective reinforcement learning algorithm. However, the basic protocol can exhibit an unstable behavior when implemented even with simple linear function approximation. While tools like target networks and experience replay are often implemented to stabilize the learning process, the individual contribution of each of these mechanisms is not well understood theoretically. This work proposes an exploration variant of the basic Q-learning protocol with linear function approximation. Our modular analysis illustrates the role played by each algorithmic tool that we adopt: a second order update rule, a set of target networks, and a mechanism akin to experience replay. Together, they enable state of the art regret bounds on linear MDPs while preserving the most prominent feature of the algorithm, namely a space complexity independent of the number of steps elapsed. Furthermore, we show that the performance of the algorithm degrades very gracefully under a new, more permissive notion of approximation error. 
Finally, the algorithm partially inherits problem dependent regret bounds, function of the number of \u2018effective\u2019 feature dimension.}\n}", "pdf": "https://proceedings.mlr.press/v162/zanette22a/zanette22a.pdf", "supp": "", "pdf_size": 472958, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13711195551605204649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Sciences and Department of Statistics, University of California, Berkeley, USA", "aff_domain": "berkeley.edu;berkeley.edu", "email": "berkeley.edu;berkeley.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/zanette22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Stable Conformal Prediction Sets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16841", "id": "16841", "proceeding": "https://proceedings.mlr.press/v162/ndiaye22a.html", "poster": "/media/PosterPDFs/ICML%202022/5ef698cd9fe650923ea331c15af3b160.png?t=1658238267.6432998", "slides": "", "author": "Eugene Ndiaye", "abstract": "When one observes a sequence of variables $(x_1, y_1), \\ldots, (x_n, y_n)$, Conformal Prediction (CP) is a methodology that allows to estimate a confidence set for $y_{n+1}$ given $x_{n+1}$ by merely assuming that the distribution of the data is exchangeable. CP sets have guaranteed coverage for any finite population size $n$. While appealing, the computation of such a set turns out to be infeasible in general, \\eg when the unknown variable $y_{n+1}$ is continuous. The bottleneck is that it is based on a procedure that readjusts a prediction model on data where we replace the unknown target by all its possible values in order to select the most probable one. This requires computing an infinite number of models, which often makes it intractable. In this paper, we combine CP techniques with classical algorithmic stability bounds to derive a prediction set computable with a single model fit. We demonstrate that our proposed confidence set does not lose any coverage guarantees while avoiding the need for data splitting as currently done in the literature. 
We provide some numerical experiments to illustrate the tightness of our estimation when the sample size is sufficiently large, on both synthetic and real datasets.", "bibtex": "@InProceedings{pmlr-v162-ndiaye22a,\n title = \t {Stable Conformal Prediction Sets},\n author = {Ndiaye, Eugene},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16462--16479},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ndiaye22a/ndiaye22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ndiaye22a.html},\n abstract = \t {When one observes a sequence of variables $(x_1, y_1), \\ldots, (x_n, y_n)$, Conformal Prediction (CP) is a methodology that allows to estimate a confidence set for $y_{n+1}$ given $x_{n+1}$ by merely assuming that the distribution of the data is exchangeable. CP sets have guaranteed coverage for any finite population size $n$. While appealing, the computation of such a set turns out to be infeasible in general, \\eg when the unknown variable $y_{n+1}$ is continuous. The bottleneck is that it is based on a procedure that readjusts a prediction model on data where we replace the unknown target by all its possible values in order to select the most probable one. This requires computing an infinite number of models, which often makes it intractable. In this paper, we combine CP techniques with classical algorithmic stability bounds to derive a prediction set computable with a single model fit. We demonstrate that our proposed confidence set does not lose any coverage guarantees while avoiding the need for data splitting as currently done in the literature. We provide some numerical experiments to illustrate the tightness of our estimation when the sample size is sufficiently large, on both synthetic and real datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/ndiaye22a/ndiaye22a.pdf", "supp": "", "pdf_size": 683506, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1322086183676915267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "H. Milton Stewart School of Industrial and Systems Engineering, Georgia Institute of Technology, Atlanta, GA, USA", "aff_domain": "gatech.edu", "email": "gatech.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/ndiaye22a.html", "aff_unique_index": "0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "H. 
Milton Stewart School of Industrial and Systems Engineering", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Staged Training for Transformer Language Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17355", "id": "17355", "proceeding": "https://proceedings.mlr.press/v162/shen22f.html", "poster": "/media/PosterPDFs/ICML%202022/ebbac19a6a88726ff7927a79610bf6be_fdIQPh0.png?t=1657650229.5981507", "slides": "", "author_site": "Sheng Shen, Pete Walsh, Kurt Keutzer, Jesse Dodge, Matthew Peters, Iz Beltagy", "author": "Sheng Shen; Pete Walsh; Kurt Keutzer; Jesse Dodge; Matthew Peters; Iz Beltagy", "abstract": "The current standard approach to scaling transformer language models trains each model size from a different random initialization. As an alternative, we consider a staged training setup that begins with a small model and incrementally increases the amount of compute used for training by applying a \"growth operator\" to increase the model depth and width. By initializing each stage with the output of the previous one, the training process effectively re-uses the compute from prior stages and becomes more efficient. Our growth operators each take as input the entire training state (including model parameters, optimizer state, learning rate schedule, etc.) and output a new training state from which training continues. We identify two important properties of these growth operators, namely that they preserve both the loss and the \u201ctraining dynamics\u201d after applying the operator. While the loss-preserving property has been discussed previously, to the best of our knowledge this work is the first to identify the importance of preserving the training dynamics (the rate of decrease of the loss during training). To find the optimal schedule for stages, we use the scaling laws from (Kaplan et al., 2020) to find a precise schedule that gives the most compute saving by starting a new stage when training efficiency starts decreasing. We empirically validate our growth operators and staged training for autoregressive language models, showing up to 22% compute savings compared to a strong baseline trained from scratch. Our code is available at https://github.com/allenai/staged-training.", "bibtex": "@InProceedings{pmlr-v162-shen22f,\n title = \t {Staged Training for Transformer Language Models},\n author = {Shen, Sheng and Walsh, Pete and Keutzer, Kurt and Dodge, Jesse and Peters, Matthew and Beltagy, Iz},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19893--19908},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shen22f/shen22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/shen22f.html},\n abstract = \t {The current standard approach to scaling transformer language models trains each model size from a different random initialization. 
As an alternative, we consider a staged training setup that begins with a small model and incrementally increases the amount of compute used for training by applying a \"growth operator\" to increase the model depth and width. By initializing each stage with the output of the previous one, the training process effectively re-uses the compute from prior stages and becomes more efficient. Our growth operators each take as input the entire training state (including model parameters, optimizer state, learning rate schedule, etc.) and output a new training state from which training continues. We identify two important properties of these growth operators, namely that they preserve both the loss and the \u201ctraining dynamics\u201d after applying the operator. While the loss-preserving property has been discussed previously, to the best of our knowledge this work is the first to identify the importance of preserving the training dynamics (the rate of decrease of the loss during training). To find the optimal schedule for stages, we use the scaling laws from (Kaplan et al., 2020) to find a precise schedule that gives the most compute saving by starting a new stage when training efficiency starts decreasing. We empirically validate our growth operators and staged training for autoregressive language models, showing up to 22% compute savings compared to a strong baseline trained from scratch. Our code is available at https://github.com/allenai/staged-training.}\n}", "pdf": "https://proceedings.mlr.press/v162/shen22f/shen22f.pdf", "supp": "", "pdf_size": 688829, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4204701598187830659&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "https://github.com/allenai/staged-training", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/shen22f.html" }, { "title": "State Transition of Dendritic Spines Improves Learning of Sparse Spiking Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17413", "id": "17413", "proceeding": "https://proceedings.mlr.press/v162/chen22ac.html", "poster": "/media/PosterPDFs/ICML%202022/9bf31c7ff062936a96d3c8bd1f8f2ff3.png?t=1656606116.418573", "slides": "/media/icml-2022/Slides/17413.pdf", "author_site": "Yanqi Chen, Zhaofei Yu, Wei Fang, Zhengyu Ma, Tiejun Huang, Yonghong Tian", "author": "Yanqi Chen; Zhaofei Yu; Wei Fang; Zhengyu Ma; Tiejun Huang; Yonghong Tian", "abstract": "Spiking Neural Networks (SNNs) are considered a promising alternative to Artificial Neural Networks (ANNs) for their event-driven computing paradigm when deployed on energy-efficient neuromorphic hardware. Recently, deep SNNs have shown breathtaking performance improvement through cutting-edge training strategy and flexible structure, which also scales up the number of parameters and computational burdens in a single network. Inspired by the state transition of dendritic spines in the filopodial model of spinogenesis, we model different states of SNN weights, facilitating weight optimization for pruning. Furthermore, the pruning speed can be regulated by using different functions describing the growing threshold of state transition. We organize these techniques as a dynamic pruning algorithm based on nonlinear reparameterization mapping from spine size to SNN weights. 
Our approach yields sparse deep networks on the large-scale dataset (SEW ResNet18 on ImageNet) while maintaining state-of-the-art low performance loss (~3% at 88.8% sparsity) compared to existing pruning methods on directly trained SNNs. Moreover, we find that pruning speed regulation while learning is crucial to avoiding disastrous performance degradation at the final stages of training, which may shed light on future work on SNN pruning.", "bibtex": "@InProceedings{pmlr-v162-chen22ac,\n title = \t {State Transition of Dendritic Spines Improves Learning of Sparse Spiking Neural Networks},\n author = {Chen, Yanqi and Yu, Zhaofei and Fang, Wei and Ma, Zhengyu and Huang, Tiejun and Tian, Yonghong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3701--3715},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22ac/chen22ac.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22ac.html},\n abstract = \t {Spiking Neural Networks (SNNs) are considered a promising alternative to Artificial Neural Networks (ANNs) for their event-driven computing paradigm when deployed on energy-efficient neuromorphic hardware. Recently, deep SNNs have shown breathtaking performance improvement through cutting-edge training strategy and flexible structure, which also scales up the number of parameters and computational burdens in a single network. Inspired by the state transition of dendritic spines in the filopodial model of spinogenesis, we model different states of SNN weights, facilitating weight optimization for pruning. Furthermore, the pruning speed can be regulated by using different functions describing the growing threshold of state transition. We organize these techniques as a dynamic pruning algorithm based on nonlinear reparameterization mapping from spine size to SNN weights. Our approach yields sparse deep networks on the large-scale dataset (SEW ResNet18 on ImageNet) while maintaining state-of-the-art low performance loss (~3% at 88.8% sparsity) compared to existing pruning methods on directly trained SNNs. 
Moreover, we find that pruning speed regulation while learning is crucial to avoiding disastrous performance degradation at the final stages of training, which may shed light on future work on SNN pruning.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22ac/chen22ac.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22ac-supp.zip", "pdf_size": 662752, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2677853348773096259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "National Engineering Research Center of Visual Technology, School of Computer Science, Peking University + Peng Cheng Laboratory + Institute for Artificial Intelligence, Peking University; National Engineering Research Center of Visual Technology, School of Computer Science, Peking University + Peng Cheng Laboratory + Institute for Artificial Intelligence, Peking University; National Engineering Research Center of Visual Technology, School of Computer Science, Peking University + Peng Cheng Laboratory; Peng Cheng Laboratory + Institute for Artificial Intelligence, Peking University; National Engineering Research Center of Visual Technology, School of Computer Science, Peking University + Peng Cheng Laboratory + Institute for Artificial Intelligence, Peking University; National Engineering Research Center of Visual Technology, School of Computer Science, Peking University + Peng Cheng Laboratory + Institute for Artificial Intelligence, Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn; ;pcl.ac.cn; ;pku.edu.cn", "email": "pku.edu.cn;pku.edu.cn; ;pcl.ac.cn; ;pku.edu.cn", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/chen22ac.html", "aff_unique_index": "0+1+0;0+1+0;0+1;1+0;0+1+0;0+1+0", "aff_unique_norm": "Peking University;Pengcheng Laboratory", "aff_unique_dep": "School of Computer Science;Peng Cheng Laboratory", "aff_unique_url": "http://www.pku.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "PKU;PCL", "aff_campus_unique_index": ";;;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0+0;0+0+0;0+0+0", "aff_country_unique": "China" }, { "title": "Statistical inference with implicit SGD: proximal Robbins-Monro vs. Polyak-Ruppert", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17781", "id": "17781", "proceeding": "https://proceedings.mlr.press/v162/lee22f.html", "poster": "/media/PosterPDFs/ICML%202022/b72a5a099433a2099fc3d92f6ad3accf.png?t=1657529181.5253832", "slides": "", "author_site": "Yoonhyung Lee, Sungdong Lee, Joong-Ho (Johann) Won", "author": "Yoonhyung Lee; Sungdong Lee; Joong-Ho Won", "abstract": "The implicit stochastic gradient descent (ISGD), a proximal version of SGD, is gaining interest in the literature due to its stability over (explicit) SGD. In this paper, we conduct an in-depth analysis of the two modes of ISGD for smooth convex functions, namely proximal Robbins-Monro (proxRM) and proximal Polyak-Ruppert (proxPR) procedures, for their use in statistical inference on model parameters. Specifically, we derive non-asymptotic point estimation error bounds of both proxRM and proxPR iterates and their limiting distributions, and propose on-line estimators of their asymptotic covariance matrices that require only a single run of ISGD. The latter estimators are used to construct valid confidence intervals for the model parameters. 
Our analysis is free of the generalized linear model assumption that has limited the preceding analyses, and employs feasible procedures. Our on-line covariance matrix estimators appear to be the first of this kind in the ISGD literature.", "bibtex": "@InProceedings{pmlr-v162-lee22f,\n title = \t {Statistical inference with implicit {SGD}: proximal Robbins-Monro vs. Polyak-Ruppert},\n author = {Lee, Yoonhyung and Lee, Sungdong and Won, Joong-Ho},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12423--12454},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22f/lee22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22f.html},\n abstract = \t {The implicit stochastic gradient descent (ISGD), a proximal version of SGD, is gaining interest in the literature due to its stability over (explicit) SGD. In this paper, we conduct an in-depth analysis of the two modes of ISGD for smooth convex functions, namely proximal Robbins-Monro (proxRM) and proximal Polyak-Ruppert (proxPR) procedures, for their use in statistical inference on model parameters. Specifically, we derive non-asymptotic point estimation error bounds of both proxRM and proxPR iterates and their limiting distributions, and propose on-line estimators of their asymptotic covariance matrices that require only a single run of ISGD. The latter estimators are used to construct valid confidence intervals for the model parameters. Our analysis is free of the generalized linear model assumption that has limited the preceding analyses, and employs feasible procedures. 
Our on-line covariance matrix estimators appear to be the first of this kind in the ISGD literature.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22f/lee22f.pdf", "supp": "", "pdf_size": 1115613, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4288879813255762600&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Kakao Entertainment Corp.; Department of Statistics, Seoul National University; Department of Statistics, Seoul National University", "aff_domain": "kakaoentertainment.com;stats.snu.ac.kr;stats.snu.ac.kr", "email": "kakaoentertainment.com;stats.snu.ac.kr;stats.snu.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/lee22f.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Kakao Entertainment Corp.;Seoul National University", "aff_unique_dep": ";Department of Statistics", "aff_unique_url": "https://www.kakao.com;https://www.snu.ac.kr", "aff_unique_abbr": "Kakao Entertainment;SNU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Steerable 3D Spherical Neurons", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18339", "id": "18339", "proceeding": "https://proceedings.mlr.press/v162/melnyk22a.html", "poster": "/media/PosterPDFs/ICML%202022/6fd86e0ad726b778e37cf270fa0247d7_Je6pdiR.png?t=1656353753.2961917", "slides": "/media/icml-2022/Slides/18339_Z8e6XuO.pdf", "author_site": "Pavlo Melnyk, Michael Felsberg, M\u00e5rten Wadenb\u00e4ck", "author": "Pavlo Melnyk; Michael Felsberg; M\u00e5rten Wadenb\u00e4ck", "abstract": "Emerging from low-level vision theory, steerable filters found their counterpart in prior work on steerable convolutional neural networks equivariant to rigid transformations. In our work, we propose a steerable feed-forward learning-based approach that consists of neurons with spherical decision surfaces and operates on point clouds. Such spherical neurons are obtained by conformal embedding of Euclidean space and have recently been revisited in the context of learning representations of point sets. Focusing on 3D geometry, we exploit the isometry property of spherical neurons and derive a 3D steerability constraint. After training spherical neurons to classify point clouds in a canonical orientation, we use a tetrahedron basis to quadruplicate the neurons and construct rotation-equivariant spherical filter banks. We then apply the derived constraint to interpolate the filter bank outputs and, thus, obtain a rotation-invariant network. Finally, we use a synthetic point set and real-world 3D skeleton data to verify our theoretical findings. 
The code is available at https://github.com/pavlo-melnyk/steerable-3d-neurons.", "bibtex": "@InProceedings{pmlr-v162-melnyk22a,\n title = \t {Steerable 3{D} Spherical Neurons},\n author = {Melnyk, Pavlo and Felsberg, Michael and Wadenb{\\\"a}ck, M{\\aa}rten},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15330--15339},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/melnyk22a/melnyk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/melnyk22a.html},\n abstract = \t {Emerging from low-level vision theory, steerable filters found their counterpart in prior work on steerable convolutional neural networks equivariant to rigid transformations. In our work, we propose a steerable feed-forward learning-based approach that consists of neurons with spherical decision surfaces and operates on point clouds. Such spherical neurons are obtained by conformal embedding of Euclidean space and have recently been revisited in the context of learning representations of point sets. Focusing on 3D geometry, we exploit the isometry property of spherical neurons and derive a 3D steerability constraint. After training spherical neurons to classify point clouds in a canonical orientation, we use a tetrahedron basis to quadruplicate the neurons and construct rotation-equivariant spherical filter banks. We then apply the derived constraint to interpolate the filter bank outputs and, thus, obtain a rotation-invariant network. Finally, we use a synthetic point set and real-world 3D skeleton data to verify our theoretical findings. 
The code is available at https://github.com/pavlo-melnyk/steerable-3d-neurons.}\n}", "pdf": "https://proceedings.mlr.press/v162/melnyk22a/melnyk22a.pdf", "supp": "", "pdf_size": 430572, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12172638513685585373&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computer Vision Laboratory, Department of Electrical Engineering, Link\u00f6ping University, SE-581 83 Link\u00f6ping, Sweden; Computer Vision Laboratory, Department of Electrical Engineering, Link\u00f6ping University, SE-581 83 Link\u00f6ping, Sweden; Computer Vision Laboratory, Department of Electrical Engineering, Link\u00f6ping University, SE-581 83 Link\u00f6ping, Sweden", "aff_domain": "liu.se;liu.se; ", "email": "liu.se;liu.se; ", "github": "https://github.com/pavlo-melnyk/steerable-3d-neurons", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/melnyk22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Link\u00f6ping University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.liu.se", "aff_unique_abbr": "LiU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Link\u00f6ping", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "title": "Stochastic Contextual Dueling Bandits under Linear Stochastic Transitivity Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17607", "id": "17607", "proceeding": "https://proceedings.mlr.press/v162/bengs22a.html", "poster": "", "slides": "", "author_site": "Viktor Bengs, Aadirupa Saha, Eyke H\u00fcllermeier", "author": "Viktor Bengs; Aadirupa Saha; Eyke H\u00fcllermeier", "abstract": "We consider the regret minimization task in a dueling bandits problem with context information. In every round of the sequential decision problem, the learner makes a context-dependent selection of two choice alternatives (arms) to be compared with each other and receives feedback in the form of noisy preference information. We assume that the feedback process is determined by a linear stochastic transitivity model with contextualized utilities (CoLST), and the learner\u2019s task is to include the best arm (with highest latent context-dependent utility) in the duel. We propose a computationally efficient algorithm, \\Algo{CoLSTIM}, which makes its choice based on imitating the feedback process using perturbed context-dependent utility estimates of the underlying CoLST model. If each arm is associated with a $d$-dimensional feature vector, we show that \\Algo{CoLSTIM} achieves a regret of order $\\tilde O( \\sqrt{dT})$ after $T$ learning rounds. Additionally, we also establish the optimality of \\Algo{CoLSTIM} by showing a lower bound for the weak regret that refines the existing average regret analysis. 
Our experiments demonstrate its superiority over state-of-the-art algorithms for special cases of CoLST models.", "bibtex": "@InProceedings{pmlr-v162-bengs22a,\n title = \t {Stochastic Contextual Dueling Bandits under Linear Stochastic Transitivity Models},\n author = {Bengs, Viktor and Saha, Aadirupa and H{\\\"u}llermeier, Eyke},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1764--1786},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/bengs22a/bengs22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/bengs22a.html},\n abstract = \t {We consider the regret minimization task in a dueling bandits problem with context information. In every round of the sequential decision problem, the learner makes a context-dependent selection of two choice alternatives (arms) to be compared with each other and receives feedback in the form of noisy preference information. We assume that the feedback process is determined by a linear stochastic transitivity model with contextualized utilities (CoLST), and the learner\u2019s task is to include the best arm (with highest latent context-dependent utility) in the duel. We propose a computationally efficient algorithm, \\Algo{CoLSTIM}, which makes its choice based on imitating the feedback process using perturbed context-dependent utility estimates of the underlying CoLST model. If each arm is associated with a $d$-dimensional feature vector, we show that \\Algo{CoLSTIM} achieves a regret of order $\\tilde O( \\sqrt{dT})$ after $T$ learning rounds. Additionally, we also establish the optimality of \\Algo{CoLSTIM} by showing a lower bound for the weak regret that refines the existing average regret analysis. 
Our experiments demonstrate its superiority over state-of-the-art algorithms for special cases of CoLST models.}\n}", "pdf": "https://proceedings.mlr.press/v162/bengs22a/bengs22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/bengs22a-supp.zip", "pdf_size": 3673520, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6098345770522518700&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Institute of Informatics, LMU Munich, Germany; Microsoft Research, New York City, US; Institute of Informatics, LMU Munich, Germany", "aff_domain": "lmu.de; ;lmu.de", "email": "lmu.de; ;lmu.de", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/bengs22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "LMU Munich;Microsoft", "aff_unique_dep": "Institute of Informatics;Microsoft Research", "aff_unique_url": "https://www.lmu.de;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "LMU;MSR", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Munich;New York City", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Stochastic Continuous Submodular Maximization: Boosting via Non-oblivious Function", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17305", "id": "17305", "proceeding": "https://proceedings.mlr.press/v162/zhang22e.html", "poster": "", "slides": "", "author_site": "Qixin Zhang, Zengde Deng, Zaiyi Chen, Haoyuan Hu, Yu Yang", "author": "Qixin Zhang; Zengde Deng; Zaiyi Chen; Haoyuan Hu; Yu Yang", "abstract": "In this paper, we revisit Stochastic Continuous Submodular Maximization in both offline and online settings, which can benefit wide applications in machine learning and operations research areas. We present a boosting framework covering gradient ascent and online gradient ascent. The fundamental ingredient of our methods is a novel non-oblivious function $F$ derived from a factor-revealing optimization problem, whose any stationary point provides a $(1-e^{-\\gamma})$-approximation to the global maximum of the $\\gamma$-weakly DR-submodular objective function $f\\in C^{1,1}_L(\\mathcal{X})$. Under the offline scenario, we propose a boosting gradient ascent method achieving $(1-e^{-\\gamma}-\\epsilon^{2})$-approximation after $O(1/\\epsilon^2)$ iterations, which improves the $(\\frac{\\gamma^2}{1+\\gamma^2})$ approximation ratio of the classical gradient ascent algorithm. In the online setting, for the first time we consider the adversarial delays for stochastic gradient feedback, under which we propose a boosting online gradient algorithm with the same non-oblivious function $F$. Meanwhile, we verify that this boosting online algorithm achieves a regret of $O(\\sqrt{D})$ against a $(1-e^{-\\gamma})$-approximation to the best feasible solution in hindsight, where $D$ is the sum of delays of gradient feedback. To the best of our knowledge, this is the first result to obtain $O(\\sqrt{T})$ regret against a $(1-e^{-\\gamma})$-approximation with $O(1)$ gradient inquiry at each time step, when no delay exists, i.e., $D=T$. 
Finally, numerical experiments demonstrate the effectiveness of our boosting methods.", "bibtex": "@InProceedings{pmlr-v162-zhang22e,\n title = \t {Stochastic Continuous Submodular Maximization: Boosting via Non-oblivious Function},\n author = {Zhang, Qixin and Deng, Zengde and Chen, Zaiyi and Hu, Haoyuan and Yang, Yu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26116--26134},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22e/zhang22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22e.html},\n abstract = \t {In this paper, we revisit Stochastic Continuous Submodular Maximization in both offline and online settings, which can benefit wide applications in machine learning and operations research areas. We present a boosting framework covering gradient ascent and online gradient ascent. The fundamental ingredient of our methods is a novel non-oblivious function $F$ derived from a factor-revealing optimization problem, whose any stationary point provides a $(1-e^{-\\gamma})$-approximation to the global maximum of the $\\gamma$-weakly DR-submodular objective function $f\\in C^{1,1}_L(\\mathcal{X})$. Under the offline scenario, we propose a boosting gradient ascent method achieving $(1-e^{-\\gamma}-\\epsilon^{2})$-approximation after $O(1/\\epsilon^2)$ iterations, which improves the $(\\frac{\\gamma^2}{1+\\gamma^2})$ approximation ratio of the classical gradient ascent algorithm. In the online setting, for the first time we consider the adversarial delays for stochastic gradient feedback, under which we propose a boosting online gradient algorithm with the same non-oblivious function $F$. Meanwhile, we verify that this boosting online algorithm achieves a regret of $O(\\sqrt{D})$ against a $(1-e^{-\\gamma})$-approximation to the best feasible solution in hindsight, where $D$ is the sum of delays of gradient feedback. To the best of our knowledge, this is the first result to obtain $O(\\sqrt{T})$ regret against a $(1-e^{-\\gamma})$-approximation with $O(1)$ gradient inquiry at each time step, when no delay exists, i.e., $D=T$. 
Finally, numerical experiments demonstrate the effectiveness of our boosting methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22e/zhang22e.pdf", "supp": "", "pdf_size": 511943, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12710098812656828793&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Data Science City University of Hong Kong, Kowloon Hong Kong, China+Cainiao Network, Hang Zhou, China; Cainiao Network, Hang Zhou, China; Cainiao Network, Hang Zhou, China; Cainiao Network, Hang Zhou, China; School of Data Science City University of Hong Kong, Kowloon Hong Kong, China", "aff_domain": "cityu.edu.hk;cainiao.com;cainiao.com;cainiao.com;cityu.edu.hk", "email": "cityu.edu.hk;cainiao.com;cainiao.com;cainiao.com;cityu.edu.hk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22e.html", "aff_unique_index": "0+1;1;1;1;0", "aff_unique_norm": "City University of Hong Kong;Cainiao Network", "aff_unique_dep": "School of Data Science;", "aff_unique_url": "https://www.cityu.edu.hk;", "aff_unique_abbr": "CityU;", "aff_campus_unique_index": "0+1;1;1;1;0", "aff_campus_unique": "Kowloon;Hang Zhou", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Stochastic Deep Networks with Linear Competing Units for Model-Agnostic Meta-Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15955", "id": "15955", "proceeding": "https://proceedings.mlr.press/v162/kalais22a.html", "poster": "/media/PosterPDFs/ICML%202022/a3c65c2974270fd093ee8a9bf8ae7d0b_XvU1TL6.png?t=1657177983.8262272", "slides": "", "author_site": "Konstantinos Kalais, Sotirios Chatzis", "author": "Konstantinos Kalais; Sotirios Chatzis", "abstract": "This work addresses meta-learning (ML) by considering deep networks with stochastic local winner-takes-all (LWTA) activations. This type of network units results in sparse representations from each model layer, as the units are organized into blocks where only one unit generates a non-zero output. The main operating principle of the introduced units rely on stochastic principles, as the network performs posterior sampling over competing units to select the winner. Therefore, the proposed networks are explicitly designed to extract input data representations of sparse stochastic nature, as opposed to the currently standard deterministic representation paradigm. Our approach produces state-of-the-art predictive accuracy on few-shot image classification and regression experiments, as well as reduced predictive error on an active learning setting; these improvements come with an immensely reduced computational cost. 
Code is available at: https://github.com/Kkalais/StochLWTA-ML", "bibtex": "@InProceedings{pmlr-v162-kalais22a,\n title = \t {Stochastic Deep Networks with Linear Competing Units for Model-Agnostic Meta-Learning},\n author = {Kalais, Konstantinos and Chatzis, Sotirios},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10586--10597},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kalais22a/kalais22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kalais22a.html},\n abstract = \t {This work addresses meta-learning (ML) by considering deep networks with stochastic local winner-takes-all (LWTA) activations. This type of network units results in sparse representations from each model layer, as the units are organized into blocks where only one unit generates a non-zero output. The main operating principle of the introduced units rely on stochastic principles, as the network performs posterior sampling over competing units to select the winner. Therefore, the proposed networks are explicitly designed to extract input data representations of sparse stochastic nature, as opposed to the currently standard deterministic representation paradigm. Our approach produces state-of-the-art predictive accuracy on few-shot image classification and regression experiments, as well as reduced predictive error on an active learning setting; these improvements come with an immensely reduced computational cost. Code is available at: https://github.com/Kkalais/StochLWTA-ML}\n}", "pdf": "https://proceedings.mlr.press/v162/kalais22a/kalais22a.pdf", "supp": "", "pdf_size": 2199592, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12812982432289049616&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Dept. of Electrical Eng., Computer Eng., and Informatics, Cyprus University of Technology, Limassol, Cyprus; Dept. of Electrical Eng., Computer Eng., and Informatics, Cyprus University of Technology, Limassol, Cyprus", "aff_domain": "cut.ac.cy;cut.ac.cy", "email": "cut.ac.cy;cut.ac.cy", "github": "https://github.com/Kkalais/StochLWTA-ML", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kalais22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Cyprus University of Technology", "aff_unique_dep": "Dept. of Electrical Eng., Computer Eng., and Informatics", "aff_unique_url": "https://www.cut.ac.cy", "aff_unique_abbr": "CUT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Limassol", "aff_country_unique_index": "0;0", "aff_country_unique": "Cyprus" }, { "title": "Stochastic Reweighted Gradient Descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16117", "id": "16117", "proceeding": "https://proceedings.mlr.press/v162/hanchi22a.html", "poster": "", "slides": "", "author_site": "Ayoub El Hanchi, David Stephens, Chris Maddison", "author": "Ayoub El Hanchi; David Stephens; Chris Maddison", "abstract": "Importance sampling is a promising strategy for improving the convergence rate of stochastic gradient methods. It is typically used to precondition the optimization problem, but it can also be used to reduce the variance of the gradient estimator. 
Unfortunately, this latter point of view has yet to lead to practical methods that provably improve the asymptotic error of stochastic gradient methods. In this work, we propose stochastic reweighted gradient descent (SRG), a stochastic gradient method based solely on importance sampling that can reduce the variance of the gradient estimator and improve on the asymptotic error of stochastic gradient descent (SGD) in the strongly convex and smooth case. We show that SRG can be extended to combine the benefits of both importance-sampling-based preconditioning and variance reduction. When compared to SGD, the resulting algorithm can simultaneously reduce the condition number and the asymptotic error, both by up to a factor equal to the number of component functions. We demonstrate improved convergence in practice on regularized logistic regression problems.", "bibtex": "@InProceedings{pmlr-v162-hanchi22a,\n title = \t {Stochastic Reweighted Gradient Descent},\n author = {Hanchi, Ayoub El and Stephens, David and Maddison, Chris},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8359--8374},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hanchi22a/hanchi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hanchi22a.html},\n abstract = \t {Importance sampling is a promising strategy for improving the convergence rate of stochastic gradient methods. It is typically used to precondition the optimization problem, but it can also be used to reduce the variance of the gradient estimator. Unfortunately, this latter point of view has yet to lead to practical methods that provably improve the asymptotic error of stochastic gradient methods. In this work, we propose stochastic reweighted gradient descent (SRG), a stochastic gradient method based solely on importance sampling that can reduce the variance of the gradient estimator and improve on the asymptotic error of stochastic gradient descent (SGD) in the strongly convex and smooth case. We show that SRG can be extended to combine the benefits of both importance-sampling-based preconditioning and variance reduction. When compared to SGD, the resulting algorithm can simultaneously reduce the condition number and the asymptotic error, both by up to a factor equal to the number of component functions. 
We demonstrate improved convergence in practice on regularized logistic regression problems.}\n}", "pdf": "https://proceedings.mlr.press/v162/hanchi22a/hanchi22a.pdf", "supp": "", "pdf_size": 865002, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13495144079776331672&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Toronto and Vector Institute; McGill University; University of Toronto and Vector Institute", "aff_domain": "cs.toronto.edu; ; ", "email": "cs.toronto.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hanchi22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Toronto;McGill University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utoronto.ca;https://www.mcgill.ca", "aff_unique_abbr": "U of T;McGill", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Stochastic Rising Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16243", "id": "16243", "proceeding": "https://proceedings.mlr.press/v162/metelli22a.html", "poster": "/media/PosterPDFs/ICML%202022/0e7e05fa1026b0c5459267608ae320b8.png?t=1657262270.3060346", "slides": "", "author_site": "Alberto Maria Metelli, Francesco Trov\u00f2, Matteo Pirola, Marcello Restelli", "author": "Alberto Maria Metelli; Francesco Trov\u00f2; Matteo Pirola; Marcello Restelli", "abstract": "This paper is in the field of stochastic Multi-Armed Bandits (MABs), i.e., those sequential selection techniques able to learn online using only the feedback given by the chosen option (a.k.a. arm). We study a particular case of the rested and restless bandits in which the arms\u2019 expected payoff is monotonically non-decreasing. This characteristic allows designing specifically crafted algorithms that exploit the regularity of the payoffs to provide tight regret bounds. We design an algorithm for the rested case (R-ed-UCB) and one for the restless case (R-less-UCB), providing a regret bound depending on the properties of the instance and, under certain circumstances, of $\\widetilde{\\mathcal{O}}(T^{\\frac{2}{3}})$. We empirically compare our algorithms with state-of-the-art methods for non-stationary MABs over several synthetically generated tasks and an online model selection problem for a real-world dataset. 
Finally, using synthetic and real-world data, we illustrate the effectiveness of the proposed approaches compared with state-of-the-art algorithms for the non-stationary bandits.", "bibtex": "@InProceedings{pmlr-v162-metelli22a,\n title = \t {Stochastic Rising Bandits},\n author = {Metelli, Alberto Maria and Trov{\\`o}, Francesco and Pirola, Matteo and Restelli, Marcello},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15421--15457},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/metelli22a/metelli22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/metelli22a.html},\n abstract = \t {This paper is in the field of stochastic Multi-Armed Bandits (MABs), i.e., those sequential selection techniques able to learn online using only the feedback given by the chosen option (a.k.a. arm). We study a particular case of the rested and restless bandits in which the arms\u2019 expected payoff is monotonically non-decreasing. This characteristic allows designing specifically crafted algorithms that exploit the regularity of the payoffs to provide tight regret bounds. We design an algorithm for the rested case (R-ed-UCB) and one for the restless case (R-less-UCB), providing a regret bound depending on the properties of the instance and, under certain circumstances, of $\\widetilde{\\mathcal{O}}(T^{\\frac{2}{3}})$. We empirically compare our algorithms with state-of-the-art methods for non-stationary MABs over several synthetically generated tasks and an online model selection problem for a real-world dataset. 
Finally, using synthetic and real-world data, we illustrate the effectiveness of the proposed approaches compared with state-of-the-art algorithms for the non-stationary bandits.}\n}", "pdf": "https://proceedings.mlr.press/v162/metelli22a/metelli22a.pdf", "supp": "", "pdf_size": 881724, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15697580060507911770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milan, Italy; Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milan, Italy; Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milan, Italy; Dipartimento di Elettronica, Informazione e Bioingegneria, Politecnico di Milano, Milan, Italy", "aff_domain": "polimi.it; ; ; ", "email": "polimi.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/metelli22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "Dipartimento di Elettronica, Informazione e Bioingegneria", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Politecnico di Milano", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Milan", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Stochastic smoothing of the top-K calibrated hinge loss for deep imbalanced classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17929", "id": "17929", "proceeding": "https://proceedings.mlr.press/v162/garcin22a.html", "poster": "/media/PosterPDFs/ICML%202022/dc36f18a9a0a776671d4879cae69b551.png?t=1657718295.3330126", "slides": "", "author_site": "Camille Garcin, Maximilien Servajean, Alexis Joly, Joseph Salmon", "author": "Camille Garcin; Maximilien Servajean; Alexis Joly; Joseph Salmon", "abstract": "In modern classification tasks, the number of labels is getting larger and larger, as is the size of the datasets encountered in practice. As the number of classes increases, class ambiguity and class imbalance become more and more problematic to achieve high top-1 accuracy. Meanwhile, Top-K metrics (metrics allowing K guesses) have become popular, especially for performance reporting. Yet, proposing top-K losses tailored for deep learning remains a challenge, both theoretically and practically. In this paper we introduce a stochastic top-K hinge loss inspired by recent developments on top-K calibrated losses. Our proposal is based on the smoothing of the top-K operator building on the flexible \"perturbed optimizer\" framework. We show that our loss function performs very well in the case of balanced datasets, while benefiting from a significantly lower computational time than the state-of-the-art top-K loss function. In addition, we propose a simple variant of our loss for the imbalanced case. 
Experiments on a heavy-tailed dataset show that our loss function significantly outperforms other baseline loss functions.", "bibtex": "@InProceedings{pmlr-v162-garcin22a,\n title = \t {Stochastic smoothing of the top-K calibrated hinge loss for deep imbalanced classification},\n author = {Garcin, Camille and Servajean, Maximilien and Joly, Alexis and Salmon, Joseph},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7208--7222},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/garcin22a/garcin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/garcin22a.html},\n abstract = \t {In modern classification tasks, the number of labels is getting larger and larger, as is the size of the datasets encountered in practice. As the number of classes increases, class ambiguity and class imbalance become more and more problematic to achieve high top-1 accuracy. Meanwhile, Top-K metrics (metrics allowing K guesses) have become popular, especially for performance reporting. Yet, proposing top-K losses tailored for deep learning remains a challenge, both theoretically and practically. In this paper we introduce a stochastic top-K hinge loss inspired by recent developments on top-K calibrated losses. Our proposal is based on the smoothing of the top-K operator building on the flexible \"perturbed optimizer\" framework. We show that our loss function performs very well in the case of balanced datasets, while benefiting from a significantly lower computational time than the state-of-the-art top-K loss function. In addition, we propose a simple variant of our loss for the imbalanced case. 
Experiments on a heavy-tailed dataset show that our loss function significantly outperforms other baseline loss functions.}\n}", "pdf": "https://proceedings.mlr.press/v162/garcin22a/garcin22a.pdf", "supp": "", "pdf_size": 3067641, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16642060329776900644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "IMAG, Univ Montpellier, CNRS, Montpellier, France+Inria, LIRMM, Univ Montpellier, CNRS, Montpellier, France; LIRMM, Univ Montpellier, CNRS, Montpellier, France+AMIS, Paul Valery University, Montpellier, France; Inria, LIRMM, Univ Montpellier, CNRS, Montpellier, France; Institut Universitaire de France (IUF)", "aff_domain": "umontpellier.fr; ;inria.fr;umontpellier.fr", "email": "umontpellier.fr; ;inria.fr;umontpellier.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/garcin22a.html", "aff_unique_index": "0+1;2+3;1;4", "aff_unique_norm": "University of Montpellier;INRIA;Laboratoire d'Informatique, de Robotique et de Micro\u00e9lectronique de Montpellier;Paul Valery University;Institut Universitaire de France", "aff_unique_dep": "IMAG;;Department of Computer Science;AMIS;", "aff_unique_url": "https://www.univ-montp1.fr;https://www.inria.fr;https://www.lirmm.fr;https://www.univ-montp3.fr;https://www.iuf.cnrs.fr", "aff_unique_abbr": "Univ Montpellier;Inria;LIRMM;;IUF", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Montpellier;", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "France" }, { "title": "Strategic Instrumental Variable Regression: Recovering Causal Relationships From Strategic Responses", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16381", "id": "16381", "proceeding": "https://proceedings.mlr.press/v162/harris22a.html", "poster": "/media/PosterPDFs/ICML%202022/4f6ffe13a5d75b2d6a3923922b3922e5.png?t=1657203779.8412898", "slides": "", "author_site": "Keegan Harris, Dung Ngo, Logan Stapleton, Hoda Heidari, Steven Wu", "author": "Keegan Harris; Dung Daniel T Ngo; Logan Stapleton; Hoda Heidari; Steven Wu", "abstract": "In settings where Machine Learning (ML) algorithms automate or inform consequential decisions about people, individual decision subjects are often incentivized to strategically modify their observable attributes to receive more favorable predictions. As a result, the distribution the assessment rule is trained on may differ from the one it operates on in deployment. While such distribution shifts, in general, can hinder accurate predictions, our work identifies a unique opportunity associated with shifts due to strategic responses: We show that we can use strategic responses effectively to recover causal relationships between the observable features and outcomes we wish to predict, even under the presence of unobserved confounding variables. Specifically, our work establishes a novel connection between strategic responses to ML models and instrumental variable (IV) regression by observing that the sequence of deployed models can be viewed as an instrument that affects agents\u2019 observable features but does not directly influence their outcomes. We show that our causal recovery method can be utilized to improve decision-making across several important criteria: individual fairness, agent outcomes, and predictive risk. 
In particular, we show that if decision subjects differ in their ability to modify non-causal attributes, any decision rule deviating from the causal coefficients can lead to (potentially unbounded) individual-level unfairness.", "bibtex": "@InProceedings{pmlr-v162-harris22a,\n title = \t {Strategic Instrumental Variable Regression: Recovering Causal Relationships From Strategic Responses},\n author = {Harris, Keegan and Ngo, Dung Daniel T and Stapleton, Logan and Heidari, Hoda and Wu, Steven},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8502--8522},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/harris22a/harris22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/harris22a.html},\n abstract = \t {In settings where Machine Learning (ML) algorithms automate or inform consequential decisions about people, individual decision subjects are often incentivized to strategically modify their observable attributes to receive more favorable predictions. As a result, the distribution the assessment rule is trained on may differ from the one it operates on in deployment. While such distribution shifts, in general, can hinder accurate predictions, our work identifies a unique opportunity associated with shifts due to strategic responses: We show that we can use strategic responses effectively to recover causal relationships between the observable features and outcomes we wish to predict, even under the presence of unobserved confounding variables. Specifically, our work establishes a novel connection between strategic responses to ML models and instrumental variable (IV) regression by observing that the sequence of deployed models can be viewed as an instrument that affects agents\u2019 observable features but does not directly influence their outcomes. We show that our causal recovery method can be utilized to improve decision-making across several important criteria: individual fairness, agent outcomes, and predictive risk. In particular, we show that if decision subjects differ in their ability to modify non-causal attributes, any decision rule deviating from the causal coefficients can lead to (potentially unbounded) individual-level unfairness.
}\n}", "pdf": "https://proceedings.mlr.press/v162/harris22a/harris22a.pdf", "supp": "", "pdf_size": 2295734, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5426296166892217767&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Computer Science, Carnegie Mellon University, Pittsburgh, USA+Computer Science Department, University of Minnesota, Minneapolis, USA; Computer Science Department, University of Minnesota, Minneapolis, USA; Computer Science Department, University of Minnesota, Minneapolis, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA", "aff_domain": "cs.cmu.edu; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/harris22a.html", "aff_unique_index": "0+1;1;1;0;0", "aff_unique_norm": "Carnegie Mellon University;University of Minnesota", "aff_unique_dep": "School of Computer Science;Computer Science Department", "aff_unique_url": "https://www.cmu.edu;https://www.umn.edu", "aff_unique_abbr": "CMU;UMN", "aff_campus_unique_index": "0+1;1;1;0;0", "aff_campus_unique": "Pittsburgh;Minneapolis", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Strategic Representation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17611", "id": "17611", "proceeding": "https://proceedings.mlr.press/v162/nair22a.html", "poster": "/media/PosterPDFs/ICML%202022/335f5352088d7d9bf74191e006d8e24c.png?t=1657710380.8801599", "slides": "", "author_site": "Vineet Nair, Ganesh Ghalme, Inbal Talgam-Cohen, Nir Rosenfeld", "author": "Vineet Nair; Ganesh Ghalme; Inbal Talgam-Cohen; Nir Rosenfeld", "abstract": "Humans have come to rely on machines for reducing excessive information to manageable representations. But this reliance can be abused \u2013 strategic machines might craft representations that manipulate their users. How can a user make good choices based on strategic representations? We formalize this as a learning problem, and pursue algorithms for decision-making that are robust to manipulation. In our main setting of interest, the system represents attributes of an item to the user, who then decides whether or not to consume. We model this interaction through the lens of strategic classification (Hardt et al. 2016), reversed: the user, who learns, plays first; and the system, which responds, plays second. The system must respond with representations that reveal \u2018nothing but the truth\u2019 but need not reveal the entire truth. Thus, the user faces the problem of learning set functions under strategic subset selection, which presents distinct algorithmic and statistical challenges. 
Our main result is a learning algorithm that minimizes error despite strategic representations, and our theoretical analysis sheds light on the trade-off between learning effort and susceptibility to manipulation.", "bibtex": "@InProceedings{pmlr-v162-nair22a,\n title = \t {Strategic Representation},\n author = {Nair, Vineet and Ghalme, Ganesh and Talgam-Cohen, Inbal and Rosenfeld, Nir},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16331--16352},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nair22a/nair22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nair22a.html},\n abstract = \t {Humans have come to rely on machines for reducing excessive information to manageable representations. But this reliance can be abused \u2013 strategic machines might craft representations that manipulate their users. How can a user make good choices based on strategic representations? We formalize this as a learning problem, and pursue algorithms for decision-making that are robust to manipulation. In our main setting of interest, the system represents attributes of an item to the user, who then decides whether or not to consume. We model this interaction through the lens of strategic classification (Hardt et al. 2016), reversed: the user, who learns, plays first; and the system, which responds, plays second. The system must respond with representations that reveal \u2018nothing but the truth\u2019 but need not reveal the entire truth. Thus, the user faces the problem of learning set functions under strategic subset selection, which presents distinct algorithmic and statistical challenges. 
Our main result is a learning algorithm that minimizes error despite strategic representations, and our theoretical analysis sheds light on the trade-off between learning effort and susceptibility to manipulation.}\n}", "pdf": "https://proceedings.mlr.press/v162/nair22a/nair22a.pdf", "supp": "", "pdf_size": 444711, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11814543740429647454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Technion \u2013 Israel Institute of Technology; Indian Institute of Technology, Hyderabad; Technion \u2013 Israel Institute of Technology; Technion \u2013 Israel Institute of Technology", "aff_domain": "cs.technion.ac.il; ; ; ", "email": "cs.technion.ac.il; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nair22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Technion \u2013 Israel Institute of Technology;Indian Institute of Technology Hyderabad", "aff_unique_dep": ";", "aff_unique_url": "https://www.technion.ac.il/en/;https://www.iith.ac.in", "aff_unique_abbr": "Technion;IIT Hyderabad", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hyderabad", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Israel;India" }, { "title": "Strategies for Safe Multi-Armed Bandits with Logarithmic Regret and Risk", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16293", "id": "16293", "proceeding": "https://proceedings.mlr.press/v162/chen22e.html", "poster": "/media/PosterPDFs/ICML%202022/32fcc8cfe1fa4c77b5c58dafd36d1a98.png?t=1657921263.3549857", "slides": "", "author_site": "Tianrui Chen, Aditya Gangrade, Venkatesh Saligrama", "author": "Tianrui Chen; Aditya Gangrade; Venkatesh Saligrama", "abstract": "We investigate a natural but surprisingly unstudied approach to the multi-armed bandit problem under safety risk constraints. Each arm is associated with an unknown law on safety risks and rewards, and the learner\u2019s goal is to maximise reward whilst not playing unsafe arms, as determined by a given threshold on the mean risk. We formulate a pseudo-regret for this setting that enforces this safety constraint in a per-round way by softly penalising any violation, regardless of the gain in reward due to the same. This has practical relevance to scenarios such as clinical trials, where one must maintain safety for each round rather than in an aggregated sense. We describe doubly optimistic strategies for this scenario, which maintain optimistic indices for both safety risk and reward. We show that schema based on both frequentist and Bayesian indices satisfy tight gap-dependent logarithmic regret bounds, and further that these play unsafe arms only logarithmically many times in total. 
This theoretical analysis is complemented by simulation studies demonstrating the effectiveness of the proposed schema, and probing the domains in which their use is appropriate.", "bibtex": "@InProceedings{pmlr-v162-chen22e,\n title = \t {Strategies for Safe Multi-Armed Bandits with Logarithmic Regret and Risk},\n author = {Chen, Tianrui and Gangrade, Aditya and Saligrama, Venkatesh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3123--3148},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22e/chen22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22e.html},\n abstract = \t {We investigate a natural but surprisingly unstudied approach to the multi-armed bandit problem under safety risk constraints. Each arm is associated with an unknown law on safety risks and rewards, and the learner\u2019s goal is to maximise reward whilst not playing unsafe arms, as determined by a given threshold on the mean risk. We formulate a pseudo-regret for this setting that enforces this safety constraint in a per-round way by softly penalising any violation, regardless of the gain in reward due to the same. This has practical relevance to scenarios such as clinical trials, where one must maintain safety for each round rather than in an aggregated sense. We describe doubly optimistic strategies for this scenario, which maintain optimistic indices for both safety risk and reward. We show that schema based on both frequentist and Bayesian indices satisfy tight gap-dependent logarithmic regret bounds, and further that these play unsafe arms only logarithmically many times in total. 
This theoretical analysis is complemented by simulation studies demonstrating the effectiveness of the proposed schema, and probing the domains in which their use is appropriate.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22e/chen22e.pdf", "supp": "", "pdf_size": 1481648, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7693704710834962775&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Boston University; Carnegie Mellon University; Boston University", "aff_domain": "andrew.cmu.edu; ; ", "email": "andrew.cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22e.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Boston University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://www.cmu.edu", "aff_unique_abbr": "BU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Streaming Algorithm for Monotone k-Submodular Maximization with Cardinality Constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16613", "id": "16613", "proceeding": "https://proceedings.mlr.press/v162/ene22a.html", "poster": "/media/PosterPDFs/ICML%202022/13d7dc096493e1f77fb4ccf3eaf79df1.png?t=1657649297.0646305", "slides": "", "author_site": "Alina Ene, Huy Nguyen", "author": "Alina Ene; Huy Nguyen", "abstract": "Maximizing a monotone k-submodular function subject to cardinality constraints is a general model for several applications ranging from influence maximization with multiple products to sensor placement with multiple sensor types and online ad allocation. Due to the large problem scale in many applications and the online nature of ad allocation, a need arises for algorithms that process elements in a streaming fashion and possibly make online decisions. In this work, we develop a new streaming algorithm for maximizing a monotone k-submodular function subject to a per-coordinate cardinality constraint attaining an approximation guarantee close to the state of the art guarantee in the offline setting. Though not typical for streaming algorithms, our streaming algorithm also readily applies to the online setting with free disposal. Our algorithm is combinatorial and enjoys fast running time and small number of function evaluations. Furthermore, its guarantee improves as the cardinality constraints get larger, which is especially suited for the large scale applications. 
For the special case of maximizing a submodular function with large budgets, our combinatorial algorithm matches the guarantee of the state-of-the-art continuous algorithm, which requires significantly more time and function evaluations.", "bibtex": "@InProceedings{pmlr-v162-ene22a,\n title = \t {Streaming Algorithm for Monotone k-Submodular Maximization with Cardinality Constraints},\n author = {Ene, Alina and Nguyen, Huy},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5944--5967},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ene22a/ene22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ene22a.html},\n abstract = \t {Maximizing a monotone k-submodular function subject to cardinality constraints is a general model for several applications ranging from influence maximization with multiple products to sensor placement with multiple sensor types and online ad allocation. Due to the large problem scale in many applications and the online nature of ad allocation, a need arises for algorithms that process elements in a streaming fashion and possibly make online decisions. In this work, we develop a new streaming algorithm for maximizing a monotone k-submodular function subject to a per-coordinate cardinality constraint attaining an approximation guarantee close to the state of the art guarantee in the offline setting. Though not typical for streaming algorithms, our streaming algorithm also readily applies to the online setting with free disposal. Our algorithm is combinatorial and enjoys fast running time and small number of function evaluations. Furthermore, its guarantee improves as the cardinality constraints get larger, which is especially suited for the large scale applications. 
For the special case of maximizing a submodular function with large budgets, our combinatorial algorithm matches the guarantee of the state-of-the-art continuous algorithm, which requires significantly more time and function evaluations.}\n}", "pdf": "https://proceedings.mlr.press/v162/ene22a/ene22a.pdf", "supp": "", "pdf_size": 2915661, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6892361167827331843&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Boston University; Khoury College of Computer and Information Science, Northeastern University", "aff_domain": "bu.edu;northeastern.edu", "email": "bu.edu;northeastern.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/ene22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Boston University;Northeastern University", "aff_unique_dep": "Department of Computer Science;Khoury College of Computer and Information Science", "aff_unique_url": "https://www.bu.edu;https://www.northeastern.edu", "aff_unique_abbr": "BU;NU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Streaming Algorithms for High-Dimensional Robust Statistics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16883", "id": "16883", "proceeding": "https://proceedings.mlr.press/v162/diakonikolas22a.html", "poster": "/media/PosterPDFs/ICML%202022/993edc98ca87f7e08494eec37fa836f7.png?t=1657817822.3727074", "slides": "/media/icml-2022/Slides/16883.pdf", "author_site": "Ilias Diakonikolas, Daniel Kane, Ankit Pensia, Thanasis Pittas", "author": "Ilias Diakonikolas; Daniel M. Kane; Ankit Pensia; Thanasis Pittas", "abstract": "We study high-dimensional robust statistics tasks in the streaming model. A recent line of work obtained computationally efficient algorithms for a range of high-dimensional robust statistics tasks. Unfortunately, all previous algorithms require storing the entire dataset, incurring memory at least quadratic in the dimension. In this work, we develop the first efficient streaming algorithms for high-dimensional robust statistics with near-optimal memory requirements (up to logarithmic factors). Our main result is for the task of high-dimensional robust mean estimation in (a strengthening of) Huber\u2019s contamination model. We give an efficient single-pass streaming algorithm for this task with near-optimal error guarantees and space complexity nearly-linear in the dimension. As a corollary, we obtain streaming algorithms with near-optimal space complexity for several more complex tasks, including robust covariance estimation, robust regression, and more generally robust stochastic optimization.", "bibtex": "@InProceedings{pmlr-v162-diakonikolas22a,\n title = \t {Streaming Algorithms for High-Dimensional Robust Statistics},\n author = {Diakonikolas, Ilias and Kane, Daniel M. 
and Pensia, Ankit and Pittas, Thanasis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5061--5117},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/diakonikolas22a/diakonikolas22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/diakonikolas22a.html},\n abstract = \t {We study high-dimensional robust statistics tasks in the streaming model. A recent line of work obtained computationally efficient algorithms for a range of high-dimensional robust statistics tasks. Unfortunately, all previous algorithms require storing the entire dataset, incurring memory at least quadratic in the dimension. In this work, we develop the first efficient streaming algorithms for high-dimensional robust statistics with near-optimal memory requirements (up to logarithmic factors). Our main result is for the task of high-dimensional robust mean estimation in (a strengthening of) Huber\u2019s contamination model. We give an efficient single-pass streaming algorithm for this task with near-optimal error guarantees and space complexity nearly-linear in the dimension. As a corollary, we obtain streaming algorithms with near-optimal space complexity for several more complex tasks, including robust covariance estimation, robust regression, and more generally robust stochastic optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/diakonikolas22a/diakonikolas22a.pdf", "supp": "", "pdf_size": 745144, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8718836476348016307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Wisconsin-Madison; University of California, San Diego; University of Wisconsin-Madison; University of Wisconsin-Madison", "aff_domain": "cs.wisc.edu; ;wisc.edu; ", "email": "cs.wisc.edu; ;wisc.edu; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/diakonikolas22a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Wisconsin-Madison;University of California, San Diego", "aff_unique_dep": ";", "aff_unique_url": "https://www.wisc.edu;https://www.ucsd.edu", "aff_unique_abbr": "UW-Madison;UCSD", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Madison;San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Streaming Algorithms for Support-Aware Histograms", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16501", "id": "16501", "proceeding": "https://proceedings.mlr.press/v162/chen22g.html", "poster": "/media/PosterPDFs/ICML%202022/2ea1202aed1e0ce30d41be4919b0cc99.png?t=1658103853.4117136", "slides": "", "author_site": "Justin Chen, Piotr Indyk, Tal Wagner", "author": "Justin Chen; Piotr Indyk; Tal Wagner", "abstract": "Histograms, i.e., piece-wise constant approximations, are a popular tool used to represent data distributions. Traditionally, the difference between the histogram and the underlying distribution (i.e., the approximation error) is measured using the L_p norm, which sums the differences between the two functions over all items in the domain. 
Although useful in many applications, the drawback of this error measure is that it treats approximation errors of all items in the same way, irrespective of whether the mass of an item is important for the downstream application that uses the approximation. As a result, even relatively simple distributions cannot be approximated by succinct histograms without incurring large error. In this paper, we address this issue by adapting the definition of approximation so that only the errors of the items that belong to the support of the distribution are considered. Under this definition, we develop efficient 1-pass and 2-pass streaming algorithms that compute near-optimal histograms in sub-linear space. We also present lower bounds on the space complexity of this problem. Surprisingly, under this notion of error, there is an exponential gap in the space complexity of 1-pass and 2-pass streaming algorithms. Finally, we demonstrate the utility of our algorithms on a collection of real and synthetic data sets.", "bibtex": "@InProceedings{pmlr-v162-chen22g,\n title = \t {Streaming Algorithms for Support-Aware Histograms},\n author = {Chen, Justin and Indyk, Piotr and Wagner, Tal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3184--3203},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22g/chen22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22g.html},\n abstract = \t {Histograms, i.e., piece-wise constant approximations, are a popular tool used to represent data distributions. Traditionally, the difference between the histogram and the underlying distribution (i.e., the approximation error) is measured using the L_p norm, which sums the differences between the two functions over all items in the domain. Although useful in many applications, the drawback of this error measure is that it treats approximation errors of all items in the same way, irrespective of whether the mass of an item is important for the downstream application that uses the approximation. As a result, even relatively simple distributions cannot be approximated by succinct histograms without incurring large error. In this paper, we address this issue by adapting the definition of approximation so that only the errors of the items that belong to the support of the distribution are considered. Under this definition, we develop efficient 1-pass and 2-pass streaming algorithms that compute near-optimal histograms in sub-linear space. We also present lower bounds on the space complexity of this problem. Surprisingly, under this notion of error, there is an exponential gap in the space complexity of 1-pass and 2-pass streaming algorithms. 
Finally, we demonstrate the utility of our algorithms on a collection of real and synthetic data sets.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22g/chen22g.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chen22g-supp.zip", "pdf_size": 511630, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4554193783250193127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "MIT, Cambridge, MA, USA; MIT, Cambridge, MA, USA; Microsoft Research, Redmond, WA, USA", "aff_domain": "mit.edu; ; ", "email": "mit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22g.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://web.mit.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MIT;MSR", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Cambridge;Redmond", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Streaming Inference for Infinite Feature Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17081", "id": "17081", "proceeding": "https://proceedings.mlr.press/v162/schaeffer22a.html", "poster": "", "slides": "", "author_site": "Rylan Schaeffer, Yilun Du, Gabrielle K Liu, Ila R. Fiete", "author": "Rylan Schaeffer; Yilun Du; Gabrielle K Liu; Ila Fiete", "abstract": "Unsupervised learning from a continuous stream of data is arguably one of the most common and most challenging problems facing intelligent agents. One class of unsupervised models, collectively termed", "bibtex": "@InProceedings{pmlr-v162-schaeffer22a,\n title = \t {Streaming Inference for Infinite Feature Models},\n author = {Schaeffer, Rylan and Du, Yilun and Liu, Gabrielle K and Fiete, Ila},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19366--19387},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/schaeffer22a/schaeffer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/schaeffer22a.html},\n abstract = \t {Unsupervised learning from a continuous stream of data is arguably one of the most common and most challenging problems facing intelligent agents. 
One class of unsupervised models, collectively termed", "pdf": "https://proceedings.mlr.press/v162/schaeffer22a/schaeffer22a.pdf", "supp": "", "pdf_size": 3397129, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7941781939967671142&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Computer Science, Stanford University + Brain and Cognitive Sciences, MIT; Electrical Engineering and Computer Science, MIT; Brain and Cognitive Sciences, MIT; Brain and Cognitive Sciences, MIT + McGovern Institute for Brain Research, MIT", "aff_domain": "cs.stanford.edu; ; ; ", "email": "cs.stanford.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/schaeffer22a.html", "aff_unique_index": "0+1;1;1;1+1", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science;Department of Brain and Cognitive Sciences", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu", "aff_unique_abbr": "Stanford;MIT", "aff_campus_unique_index": "0+1;1;1;1+1", "aff_campus_unique": "Stanford;Cambridge", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "StreamingQA: A Benchmark for Adaptation to New Knowledge over Time in Question Answering Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17993", "id": "17993", "proceeding": "https://proceedings.mlr.press/v162/liska22a.html", "poster": "/media/PosterPDFs/ICML%202022/885cb47f87718a2cd8641ae79113eeea.png?t=1657884862.765441", "slides": "", "author_site": "Adam Liska, Tomas Kocisky, Elena Gribovskaya, Tayfun Terzi, Eren Sezener, Devang Agrawal, Cyprien de Masson d'Autume, Tim Scholtes, Manzil Zaheer, Susannah Young, Ellen Gilsenan-McMahon, Sophia Austin, Phil Blunsom, Angeliki Lazaridou", "author": "Adam Liska; Tomas Kocisky; Elena Gribovskaya; Tayfun Terzi; Eren Sezener; Devang Agrawal; Cyprien De Masson D\u2019Autume; Tim Scholtes; Manzil Zaheer; Susannah Young; Ellen Gilsenan-Mcmahon; Sophia Austin; Phil Blunsom; Angeliki Lazaridou", "abstract": "Knowledge and language understanding of models evaluated through question answering (QA) has been usually studied on static snapshots of knowledge, like Wikipedia. However, our world is dynamic, evolves over time, and our models\u2019 knowledge becomes outdated. To study how semi-parametric QA models and their underlying parametric language models (LMs) adapt to evolving knowledge, we construct a new large-scale dataset, StreamingQA, with human written and generated questions asked on a given date, to be answered from 14 years of time-stamped news articles. We evaluate our models quarterly as they read new articles not seen in pre-training. We show that parametric models can be updated without full retraining, while avoiding catastrophic forgetting. For semi-parametric models, adding new articles into the search space allows for rapid adaptation, however, models with an outdated underlying LM under-perform those with a retrained LM. For questions about higher-frequency named entities, parametric updates are particularly beneficial. 
In our dynamic world, the StreamingQA dataset enables a more realistic evaluation of QA models, and our experiments highlight several promising directions for future research.", "bibtex": "@InProceedings{pmlr-v162-liska22a,\n title = \t {{S}treaming{QA}: A Benchmark for Adaptation to New Knowledge over Time in Question Answering Models},\n author = {Liska, Adam and Kocisky, Tomas and Gribovskaya, Elena and Terzi, Tayfun and Sezener, Eren and Agrawal, Devang and De Masson D'Autume, Cyprien and Scholtes, Tim and Zaheer, Manzil and Young, Susannah and Gilsenan-Mcmahon, Ellen and Austin, Sophia and Blunsom, Phil and Lazaridou, Angeliki},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13604--13622},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liska22a/liska22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/liska22a.html},\n abstract = \t {Knowledge and language understanding of models evaluated through question answering (QA) has been usually studied on static snapshots of knowledge, like Wikipedia. However, our world is dynamic, evolves over time, and our models\u2019 knowledge becomes outdated. To study how semi-parametric QA models and their underlying parametric language models (LMs) adapt to evolving knowledge, we construct a new large-scale dataset, StreamingQA, with human written and generated questions asked on a given date, to be answered from 14 years of time-stamped news articles. We evaluate our models quarterly as they read new articles not seen in pre-training. We show that parametric models can be updated without full retraining, while avoiding catastrophic forgetting. For semi-parametric models, adding new articles into the search space allows for rapid adaptation, however, models with an outdated underlying LM under-perform those with a retrained LM. For questions about higher-frequency named entities, parametric updates are particularly beneficial. 
In our dynamic world, the StreamingQA dataset enables a more realistic evaluation of QA models, and our experiments highlight several promising directions for future research.}\n}", "pdf": "https://proceedings.mlr.press/v162/liska22a/liska22a.pdf", "supp": "", "pdf_size": 847652, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14847402247915330134&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff": "Glyphic AI, work done at DeepMind; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; Glyphic AI, work done at DeepMind; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; Cohere, work done at DeepMind+Department of Computer Science, University of Oxford, Oxford, UK; DeepMind, London, UK", "aff_domain": "deepmind.com;deepmind.com; ; ; ; ; ; ; ; ; ; ; ;", "email": "deepmind.com;deepmind.com; ; ; ; ; ; ; ; ; ; ; ;", "github": "", "project": "", "author_num": 14, "oa": "https://proceedings.mlr.press/v162/liska22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0+1;0", "aff_unique_norm": "DeepMind;University of Oxford", "aff_unique_dep": "DeepMind;Department of Computer Science", "aff_unique_url": "https://deepmind.com;https://www.ox.ac.uk", "aff_unique_abbr": "DeepMind;Oxford", "aff_campus_unique_index": "1;1;1;1;1;1;1;1;1;1;2;1", "aff_campus_unique": ";London;Oxford", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "Structural Entropy Guided Graph Hierarchical Pooling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15959", "id": "15959", "proceeding": "https://proceedings.mlr.press/v162/wu22b.html", "poster": "/media/PosterPDFs/ICML%202022/9dcb88e0137649590b755372b040afad.png?t=1655089467.0944345", "slides": "", "author_site": "Junran Wu, Xueyuan Chen, Ke Xu, Shangzhe Li", "author": "Junran Wu; Xueyuan Chen; Ke Xu; Shangzhe Li", "abstract": "Following the success of convolution on non-Euclidean space, the corresponding pooling approaches have also been validated on various tasks regarding graphs. However, because of the fixed compression ratio and stepwise pooling design, these hierarchical pooling methods still suffer from local structure damage and suboptimal problem. In this work, inspired by structural entropy, we propose a hierarchical pooling approach, SEP, to tackle the two issues. Specifically, without assigning the layer-specific compression ratio, a global optimization algorithm is designed to generate the cluster assignment matrices for pooling at once. Then, we present an illustration of the local structure damage from previous methods in reconstruction of ring and grid synthetic graphs. In addition to SEP, we further design two classification models, SEP-G and SEP-N for graph classification and node classification, respectively. 
The results show that SEP outperforms state-of-the-art graph pooling methods on graph classification benchmarks and obtains superior performance on node classifications.", "bibtex": "@InProceedings{pmlr-v162-wu22b,\n title = \t {Structural Entropy Guided Graph Hierarchical Pooling},\n author = {Wu, Junran and Chen, Xueyuan and Xu, Ke and Li, Shangzhe},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24017--24030},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22b/wu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22b.html},\n abstract = \t {Following the success of convolution on non-Euclidean space, the corresponding pooling approaches have also been validated on various tasks regarding graphs. However, because of the fixed compression ratio and stepwise pooling design, these hierarchical pooling methods still suffer from local structure damage and suboptimal problem. In this work, inspired by structural entropy, we propose a hierarchical pooling approach, SEP, to tackle the two issues. Specifically, without assigning the layer-specific compression ratio, a global optimization algorithm is designed to generate the cluster assignment matrices for pooling at once. Then, we present an illustration of the local structure damage from previous methods in reconstruction of ring and grid synthetic graphs. In addition to SEP, we further design two classification models, SEP-G and SEP-N for graph classification and node classification, respectively. 
The results show that SEP outperforms state-of-the-art graph pooling methods on graph classification benchmarks and obtains superior performance on node classifications.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22b/wu22b.pdf", "supp": "", "pdf_size": 1242326, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15391796189805731538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "State Key Lab of Software Development Environment, Beihang University, Beijing, 100191, China; State Key Lab of Software Development Environment, Beihang University, Beijing, 100191, China; State Key Lab of Software Development Environment, Beihang University, Beijing, 100191, China; School of Mathematical Science, Beihang University, Beijing, 100191, China", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "email": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wu22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "State Key Lab of Software Development Environment", "aff_unique_url": "http://www.buaa.edu.cn", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Structure Preserving Neural Networks: A Case Study in the Entropy Closure of the Boltzmann Equation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15979", "id": "15979", "proceeding": "https://proceedings.mlr.press/v162/schotthofer22a.html", "poster": "/media/PosterPDFs/ICML%202022/8562ae5e286544710b2e7ebe9858833b.png?t=1657264681.7108467", "slides": "", "author_site": "Steffen Schotth\u00f6fer, Tianbai Xiao, Martin Frank, Cory Hauck", "author": "Steffen Schotth\u00f6fer; Tianbai Xiao; Martin Frank; Cory Hauck", "abstract": "In this paper, we explore applications of deep learning in statistical physics. We choose the Boltzmann equation as a typical example, where neural networks serve as a closure to its moment system. We present two types of neural networks to embed the convexity of entropy and to preserve the minimum entropy principle and intrinsic mathematical structures of the moment system of the Boltzmann equation. We derive an error bound for the generalization gap of convex neural networks which are trained in Sobolev norm and use the results to construct data sampling methods for neural network training. 
Numerical experiments demonstrate that the neural entropy closure is significantly faster than classical optimizers while maintaining sufficient accuracy.", "bibtex": "@InProceedings{pmlr-v162-schotthofer22a,\n title = \t {Structure Preserving Neural Networks: A Case Study in the Entropy Closure of the Boltzmann Equation},\n author = {Schotth{\\\"o}fer, Steffen and Xiao, Tianbai and Frank, Martin and Hauck, Cory},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19406--19433},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/schotthofer22a/schotthofer22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/schotthofer22a.html},\n abstract = \t {In this paper, we explore applications of deep learning in statistical physics. We choose the Boltzmann equation as a typical example, where neural networks serve as a closure to its moment system. We present two types of neural networks to embed the convexity of entropy and to preserve the minimum entropy principle and intrinsic mathematical structures of the moment system of the Boltzmann equation. We derive an error bound for the generalization gap of convex neural networks which are trained in Sobolev norm and use the results to construct data sampling methods for neural network training. Numerical experiments demonstrate that the neural entropy closure is significantly faster than classical optimizers while maintaining sufficient accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/schotthofer22a/schotthofer22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/schotthofer22a-supp.zip", "pdf_size": 8366825, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16054545270129321214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Applied and Numerical Mathematics, Karlsruhe Institute of Technology, Karlsruhe, Germany; Department of Applied and Numerical Mathematics, Karlsruhe Institute of Technology, Karlsruhe, Germany; Department of Applied and Numerical Mathematics, Karlsruhe Institute of Technology, Karlsruhe, Germany; Computer Science and Mathematics Division, Oak Ridge National Laboratory, Oak Ridge, TN, USA + Department of Mathematics (Joint Faculty), University of Tennessee, Knoxville, TN, USA", "aff_domain": "kit.edu; ; ; ", "email": "kit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/schotthofer22a.html", "aff_unique_index": "0;0;0;1+2", "aff_unique_norm": "Karlsruhe Institute of Technology;Oak Ridge National Laboratory;University of Tennessee", "aff_unique_dep": "Department of Applied and Numerical Mathematics;Computer Science and Mathematics Division;Department of Mathematics", "aff_unique_url": "https://www.kit.edu;https://www.ornl.gov;https://www.utk.edu", "aff_unique_abbr": "KIT;ORNL;UT", "aff_campus_unique_index": "0;0;0;1+2", "aff_campus_unique": "Karlsruhe;Oak Ridge;Knoxville", "aff_country_unique_index": "0;0;0;1+1", "aff_country_unique": "Germany;United States" }, { "title": "Structure-Aware Transformer for Graph Representation Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17987", "id": "17987", "proceeding": 
"https://proceedings.mlr.press/v162/chen22r.html", "poster": "/media/PosterPDFs/ICML%202022/2a0f97f81755e2878b264adf39cba68e.png?t=1657484946.7296402", "slides": "", "author_site": "Dexiong Chen, Leslie O'Bray, Karsten Borgwardt", "author": "Dexiong Chen; Leslie O\u2019Bray; Karsten Borgwardt", "abstract": "The Transformer architecture has gained growing attention in graph representation learning recently, as it naturally overcomes several limitations of graph neural networks (GNNs) by avoiding their strict structural inductive biases and instead only encoding the graph structure via positional encoding. Here, we show that the node representations generated by the Transformer with positional encoding do not necessarily capture structural similarity between them. To address this issue, we propose the Structure-Aware Transformer, a class of simple and flexible graph Transformers built upon a new self-attention mechanism. This new self-attention incorporates structural information into the original self-attention by extracting a subgraph representation rooted at each node before computing the attention. We propose several methods for automatically generating the subgraph representation and show theoretically that the resulting representations are at least as expressive as the subgraph representations. Empirically, our method achieves state-of-the-art performance on five graph prediction benchmarks. Our structure-aware framework can leverage any existing GNN to extract the subgraph representation, and we show that it systematically improves performance relative to the base GNN model, successfully combining the advantages of GNNs and Transformers. Our code is available at https://github.com/BorgwardtLab/SAT.", "bibtex": "@InProceedings{pmlr-v162-chen22r,\n title = \t {Structure-Aware Transformer for Graph Representation Learning},\n author = {Chen, Dexiong and O'Bray, Leslie and Borgwardt, Karsten},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3469--3489},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22r/chen22r.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22r.html},\n abstract = \t {The Transformer architecture has gained growing attention in graph representation learning recently, as it naturally overcomes several limitations of graph neural networks (GNNs) by avoiding their strict structural inductive biases and instead only encoding the graph structure via positional encoding. Here, we show that the node representations generated by the Transformer with positional encoding do not necessarily capture structural similarity between them. To address this issue, we propose the Structure-Aware Transformer, a class of simple and flexible graph Transformers built upon a new self-attention mechanism. This new self-attention incorporates structural information into the original self-attention by extracting a subgraph representation rooted at each node before computing the attention. We propose several methods for automatically generating the subgraph representation and show theoretically that the resulting representations are at least as expressive as the subgraph representations. 
Empirically, our method achieves state-of-the-art performance on five graph prediction benchmarks. Our structure-aware framework can leverage any existing GNN to extract the subgraph representation, and we show that it systematically improves performance relative to the base GNN model, successfully combining the advantages of GNNs and Transformers. Our code is available at https://github.com/BorgwardtLab/SAT.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22r/chen22r.pdf", "supp": "", "pdf_size": 539479, "gs_citation": 383, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4875324713433840142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Biosystems Science and Engineering, ETH Z\u00fcrich, Switzerland + SIB Swiss Institute of Bioinformatics, Switzerland; Department of Biosystems Science and Engineering, ETH Z\u00fcrich, Switzerland + SIB Swiss Institute of Bioinformatics, Switzerland; Department of Biosystems Science and Engineering, ETH Z\u00fcrich, Switzerland + SIB Swiss Institute of Bioinformatics, Switzerland", "aff_domain": "bsse.ethz.ch;bsse.ethz.ch; ", "email": "bsse.ethz.ch;bsse.ethz.ch; ", "github": "https://github.com/BorgwardtLab/SAT", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22r.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "ETH Zurich;Swiss Institute of Bioinformatics", "aff_unique_dep": "Department of Biosystems Science and Engineering;", "aff_unique_url": "https://www.ethz.ch;https://www.sib.swiss", "aff_unique_abbr": "ETH;SIB", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Switzerland" }, { "title": "Structure-preserving GANs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16379", "id": "16379", "proceeding": "https://proceedings.mlr.press/v162/birrell22a.html", "poster": "/media/PosterPDFs/ICML%202022/96671501524948bc3937b4b30d0e57b9.png?t=1657202231.1112118", "slides": "", "author_site": "Jeremiah Birrell, Markos Katsoulakis, Luc Rey-Bellet, Wei Zhu", "author": "Jeremiah Birrell; Markos Katsoulakis; Luc Rey-Bellet; Wei Zhu", "abstract": "Generative adversarial networks (GANs), a class of distribution-learning methods based on a two-player game between a generator and a discriminator, can generally be formulated as a minmax problem based on the variational representation of a divergence between the unknown and the generated distributions. We introduce structure-preserving GANs as a data-efficient framework for learning distributions with additional structure such as group symmetry, by developing new variational representations for divergences. Our theory shows that we can reduce the discriminator space to its projection on the invariant discriminator space, using the conditional expectation with respect to the sigma-algebra associated to the underlying structure. In addition, we prove that the discriminator space reduction must be accompanied by a careful design of structured generators, as flawed designs may easily lead to a catastrophic \u201cmode collapse\u201d of the learned distribution. We contextualize our framework by building symmetry-preserving GANs for distributions with intrinsic group symmetry, and demonstrate that both players, namely the equivariant generator and invariant discriminator, play important but distinct roles in the learning process. 
Empirical experiments and ablation studies across a broad range of data sets, including real-world medical imaging, validate our theory, and show our proposed methods achieve significantly improved sample fidelity and diversity\u2014almost an order of magnitude measured in Frechet Inception Distance\u2014especially in the small data regime.", "bibtex": "@InProceedings{pmlr-v162-birrell22a,\n title = \t {Structure-preserving {GAN}s},\n author = {Birrell, Jeremiah and Katsoulakis, Markos and Rey-Bellet, Luc and Zhu, Wei},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1982--2020},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/birrell22a/birrell22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/birrell22a.html},\n abstract = \t {Generative adversarial networks (GANs), a class of distribution-learning methods based on a two-player game between a generator and a discriminator, can generally be formulated as a minmax problem based on the variational representation of a divergence between the unknown and the generated distributions. We introduce structure-preserving GANs as a data-efficient framework for learning distributions with additional structure such as group symmetry, by developing new variational representations for divergences. Our theory shows that we can reduce the discriminator space to its projection on the invariant discriminator space, using the conditional expectation with respect to the sigma-algebra associated to the underlying structure. In addition, we prove that the discriminator space reduction must be accompanied by a careful design of structured generators, as flawed designs may easily lead to a catastrophic \u201cmode collapse\u201d of the learned distribution. We contextualize our framework by building symmetry-preserving GANs for distributions with intrinsic group symmetry, and demonstrate that both players, namely the equivariant generator and invariant discriminator, play important but distinct roles in the learning process. 
Empirical experiments and ablation studies across a broad range of data sets, including real-world medical imaging, validate our theory, and show our proposed methods achieve significantly improved sample fidelity and diversity\u2014almost an order of magnitude measured in Frechet Inception Distance\u2014especially in the small data regime.}\n}", "pdf": "https://proceedings.mlr.press/v162/birrell22a/birrell22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/birrell22a-supp.zip", "pdf_size": 14063713, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14018356317806469563&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Mathematics and Statistics, University of Massachusetts Amherst, Amherst, MA 01003, USA; Department of Mathematics and Statistics, University of Massachusetts Amherst, Amherst, MA 01003, USA; Department of Mathematics and Statistics, University of Massachusetts Amherst, Amherst, MA 01003, USA; Department of Mathematics and Statistics, University of Massachusetts Amherst, Amherst, MA 01003, USA", "aff_domain": "math.umass.edu; ; ; ", "email": "math.umass.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/birrell22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "Department of Mathematics and Statistics", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Structured Stochastic Gradient MCMC", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16899", "id": "16899", "proceeding": "https://proceedings.mlr.press/v162/alexos22a.html", "poster": "/media/PosterPDFs/ICML%202022/280cf18baf4311c92aa5a042336587d3.png?t=1657394152.1664944", "slides": "", "author_site": "Antonios Alexos, Alex Boyd, Stephan Mandt", "author": "Antonios Alexos; Alex J Boyd; Stephan Mandt", "abstract": "Stochastic gradient Markov Chain Monte Carlo (SGMCMC) is a scalable algorithm for asymptotically exact Bayesian inference in parameter-rich models, such as Bayesian neural networks. However, since mixing can be slow in high dimensions, practitioners often resort to variational inference (VI). Unfortunately, VI makes strong assumptions on both the factorization and functional form of the posterior. To relax these assumptions, this work proposes a new non-parametric variational inference scheme that combines ideas from both SGMCMC and coordinate-ascent VI. The approach relies on a new Langevin-type algorithm that operates on a \"self-averaged\" posterior energy function, where parts of the latent variables are averaged over samples from earlier iterations of the Markov chain. This way, statistical dependencies between coordinates can be broken in a controlled way, allowing the chain to mix faster. This scheme can be further modified in a \"dropout\" manner, leading to even more scalability. We test our scheme for ResNet-20 on CIFAR-10, SVHN, and FMNIST. 
In all cases, we find improvements in convergence speed and/or final accuracy compared to SGMCMC and parametric VI.", "bibtex": "@InProceedings{pmlr-v162-alexos22a,\n title = \t {Structured Stochastic Gradient {MCMC}},\n author = {Alexos, Antonios and Boyd, Alex J and Mandt, Stephan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {414--434},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/alexos22a/alexos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/alexos22a.html},\n abstract = \t {Stochastic gradient Markov Chain Monte Carlo (SGMCMC) is a scalable algorithm for asymptotically exact Bayesian inference in parameter-rich models, such as Bayesian neural networks. However, since mixing can be slow in high dimensions, practitioners often resort to variational inference (VI). Unfortunately, VI makes strong assumptions on both the factorization and functional form of the posterior. To relax these assumptions, this work proposes a new non-parametric variational inference scheme that combines ideas from both SGMCMC and coordinate-ascent VI. The approach relies on a new Langevin-type algorithm that operates on a \"self-averaged\" posterior energy function, where parts of the latent variables are averaged over samples from earlier iterations of the Markov chain. This way, statistical dependencies between coordinates can be broken in a controlled way, allowing the chain to mix faster. This scheme can be further modified in a \"dropout\" manner, leading to even more scalability. We test our scheme for ResNet-20 on CIFAR-10, SVHN, and FMNIST. 
In all cases, we find improvements in convergence speed and/or final accuracy compared to SGMCMC and parametric VI.}\n}", "pdf": "https://proceedings.mlr.press/v162/alexos22a/alexos22a.pdf", "supp": "", "pdf_size": 915149, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8097612641869986343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of California, Irvine, USA+Department of Statistics, University of California, Irvine, USA; Department of Computer Science, University of California, Irvine, USA+Department of Statistics, University of California, Irvine, USA; Department of Computer Science, University of California, Irvine, USA+Department of Statistics, University of California, Irvine, USA", "aff_domain": "uci.edu;uci.edu; ", "email": "uci.edu;uci.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/alexos22a.html", "aff_unique_index": "0+0;0+0;0+0", "aff_unique_norm": "University of California, Irvine", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uci.edu", "aff_unique_abbr": "UCI", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Irvine", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Style Equalization: Unsupervised Learning of Controllable Generative Sequence Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16199", "id": "16199", "proceeding": "https://proceedings.mlr.press/v162/chang22a.html", "poster": "/media/PosterPDFs/ICML%202022/7e6ff0205749bc6025b51155e26f6ced.png?t=1656344974.9138517", "slides": "", "author_site": "Jen-Hao Rick Chang, Ashish Shrivastava, Hema Koppula, Xiaoshuai Zhang, Oncel Tuzel", "author": "Jen-Hao Rick Chang; Ashish Shrivastava; Hema Koppula; Xiaoshuai Zhang; Oncel Tuzel", "abstract": "Controllable generative sequence models with the capability to extract and replicate the style of specific examples enable many applications, including narrating audiobooks in different voices, auto-completing and auto-correcting written handwriting, and generating missing training samples for downstream recognition tasks. However, under an unsupervised-style setting, typical training algorithms for controllable sequence generative models suffer from the training-inference mismatch, where the same sample is used as content and style input during training but unpaired samples are given during inference. In this paper, we tackle the training-inference mismatch encountered during unsupervised learning of controllable generative sequence models. The proposed method is simple yet effective, where we use a style transformation module to transfer target style information into an unrelated style input. This method enables training using unpaired content and style samples and thereby mitigate the training-inference mismatch. We apply style equalization to text-to-speech and text-to-handwriting synthesis on three datasets. We conduct thorough evaluation, including both quantitative and qualitative user studies. 
Our results show that by mitigating the training-inference mismatch with the proposed style equalization, we achieve style replication scores comparable to real data in our user studies.", "bibtex": "@InProceedings{pmlr-v162-chang22a,\n title = \t {Style Equalization: Unsupervised Learning of Controllable Generative Sequence Models},\n author = {Chang, Jen-Hao Rick and Shrivastava, Ashish and Koppula, Hema and Zhang, Xiaoshuai and Tuzel, Oncel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2917--2937},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chang22a/chang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chang22a.html},\n abstract = \t {Controllable generative sequence models with the capability to extract and replicate the style of specific examples enable many applications, including narrating audiobooks in different voices, auto-completing and auto-correcting written handwriting, and generating missing training samples for downstream recognition tasks. However, under an unsupervised-style setting, typical training algorithms for controllable sequence generative models suffer from the training-inference mismatch, where the same sample is used as content and style input during training but unpaired samples are given during inference. In this paper, we tackle the training-inference mismatch encountered during unsupervised learning of controllable generative sequence models. The proposed method is simple yet effective, where we use a style transformation module to transfer target style information into an unrelated style input. This method enables training using unpaired content and style samples and thereby mitigate the training-inference mismatch. We apply style equalization to text-to-speech and text-to-handwriting synthesis on three datasets. We conduct thorough evaluation, including both quantitative and qualitative user studies. 
Our results show that by mitigating the training-inference mismatch with the proposed style equalization, we achieve style replication scores comparable to real data in our user studies.}\n}", "pdf": "https://proceedings.mlr.press/v162/chang22a/chang22a.pdf", "supp": "", "pdf_size": 2668461, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9338198586933521323&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Apple; Apple; Apple; University of California, San Diego; Apple", "aff_domain": "apple.com; ; ; ; ", "email": "apple.com; ; ; ; ", "github": "https://apple.github.io/ml-style-equalization", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chang22a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Apple;University of California, San Diego", "aff_unique_dep": "Apple Inc.;", "aff_unique_url": "https://www.apple.com;https://www.ucsd.edu", "aff_unique_abbr": "Apple;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sublinear-Time Clustering Oracle for Signed Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16037", "id": "16037", "proceeding": "https://proceedings.mlr.press/v162/neumann22a.html", "poster": "/media/PosterPDFs/ICML%202022/1c6a0198177bfcc9bd93f6aab94aad3c.png?t=1657703985.4779203", "slides": "", "author_site": "Stefan Neumann, Pan Peng", "author": "Stefan Neumann; Pan Peng", "abstract": "Social networks are often modeled using signed graphs, where vertices correspond to users and edges have a sign that indicates whether an interaction between users was positive or negative. The arising signed graphs typically contain a clear community structure in the sense that the graph can be partitioned into a small number of polarized communities, each defining a sparse cut and indivisible into smaller polarized sub-communities. We provide a local clustering oracle for signed graphs with such a clear community structure, that can answer membership queries, i.e., \u201cGiven a vertex\u00a0$v$, which community does\u00a0$v$ belong to?\u201d, in sublinear time by reading only a small portion of the graph. Formally, when the graph has bounded maximum degree and the number of communities is at most $O(\\log n)$, then with $\\tilde{O}(\\sqrt{n}\\operatorname{poly}(1/\\varepsilon))$ preprocessing time, our oracle can answer each membership query in $\\tilde{O}(\\sqrt{n}\\operatorname{poly}(1/\\varepsilon))$ time, and it correctly classifies a $(1-\\varepsilon)$-fraction of vertices w.r.t. a set of hidden planted ground-truth communities. Our oracle is desirable in applications where the clustering information is needed for only a small number of vertices. Previously, such local clustering oracles were only known for unsigned graphs; our generalization to signed graphs requires a number of new ideas and gives a novel spectral analysis of the behavior of random walks with signs. 
We evaluate our algorithm for constructing such an oracle and answering membership queries on both synthetic and real-world datasets, validating its performance in practice.", "bibtex": "@InProceedings{pmlr-v162-neumann22a,\n title = \t {Sublinear-Time Clustering Oracle for Signed Graphs},\n author = {Neumann, Stefan and Peng, Pan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16496--16528},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/neumann22a/neumann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/neumann22a.html},\n abstract = \t {Social networks are often modeled using signed graphs, where vertices correspond to users and edges have a sign that indicates whether an interaction between users was positive or negative. The arising signed graphs typically contain a clear community structure in the sense that the graph can be partitioned into a small number of polarized communities, each defining a sparse cut and indivisible into smaller polarized sub-communities. We provide a local clustering oracle for signed graphs with such a clear community structure, that can answer membership queries, i.e., \u201cGiven a vertex\u00a0$v$, which community does\u00a0$v$ belong to?\u201d, in sublinear time by reading only a small portion of the graph. Formally, when the graph has bounded maximum degree and the number of communities is at most $O(\\log n)$, then with $\\tilde{O}(\\sqrt{n}\\operatorname{poly}(1/\\varepsilon))$ preprocessing time, our oracle can answer each membership query in $\\tilde{O}(\\sqrt{n}\\operatorname{poly}(1/\\varepsilon))$ time, and it correctly classifies a $(1-\\varepsilon)$-fraction of vertices w.r.t. a set of hidden planted ground-truth communities. Our oracle is desirable in applications where the clustering information is needed for only a small number of vertices. Previously, such local clustering oracles were only known for unsigned graphs; our generalization to signed graphs requires a number of new ideas and gives a novel spectral analysis of the behavior of random walks with signs. We evaluate our algorithm for constructing such an oracle and answering membership queries on both synthetic and real-world datasets, validating its performance in practice.}\n}", "pdf": "https://proceedings.mlr.press/v162/neumann22a/neumann22a.pdf", "supp": "", "pdf_size": 673677, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11680644385251401321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/neumann22a.html" }, { "title": "Subspace Learning for Effective Meta-Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18137", "id": "18137", "proceeding": "https://proceedings.mlr.press/v162/jiang22b.html", "poster": "/media/PosterPDFs/ICML%202022/6b493230205f780e1bc26945df7481e5.png?t=1657531755.3734248", "slides": "", "author_site": "Weisen JIANG, James Kwok, Yu Zhang", "author": "Weisen Jiang; James Kwok; Yu Zhang", "abstract": "Meta-learning aims to extract meta-knowledge from historical tasks to accelerate learning on new tasks. 
Typical meta-learning algorithms like MAML learn a globally-shared meta-model for all tasks. However, when the task environments are complex, task model parameters are diverse and a common meta-model is insufficient to capture all the meta-knowledge. To address this challenge, in this paper, task model parameters are structured into multiple subspaces, and each subspace represents one type of meta-knowledge. We propose an algorithm to learn the meta-parameters (\\ie, subspace bases). We theoretically study the generalization properties of the learned subspaces. Experiments on regression and classification meta-learning datasets verify the effectiveness of the proposed algorithm.", "bibtex": "@InProceedings{pmlr-v162-jiang22b,\n title = \t {Subspace Learning for Effective Meta-Learning},\n author = {Jiang, Weisen and Kwok, James and Zhang, Yu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10177--10194},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jiang22b/jiang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/jiang22b.html},\n abstract = \t {Meta-learning aims to extract meta-knowledge from historical tasks to accelerate learning on new tasks. Typical meta-learning algorithms like MAML learn a globally-shared meta-model for all tasks. However, when the task environments are complex, task model parameters are diverse and a common meta-model is insufficient to capture all the meta-knowledge. To address this challenge, in this paper, task model parameters are structured into multiple subspaces, and each subspace represents one type of meta-knowledge. We propose an algorithm to learn the meta-parameters (\\ie, subspace bases). We theoretically study the generalization properties of the learned subspaces. 
Experiments on regression and classification meta-learning datasets verify the effectiveness of the proposed algorithm.}\n}", "pdf": "https://proceedings.mlr.press/v162/jiang22b/jiang22b.pdf", "supp": "", "pdf_size": 1571226, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16175077761776861601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Guangdong Provincial Key Laboratory of Brain-inspired Intelligent Computation, Department of Computer Science and Engineering, Southern University of Science and Technology+Peng Cheng Laboratory; Department of Computer Science and Engineering, Hong Kong University of Science and Technology; Guangdong Provincial Key Laboratory of Brain-inspired Intelligent Computation, Department of Computer Science and Engineering, Southern University of Science and Technology+Peng Cheng Laboratory", "aff_domain": "sustech.edu.cn;ust.hk;gmail.com", "email": "sustech.edu.cn;ust.hk;gmail.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jiang22b.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Southern University of Science and Technology;Pengcheng Laboratory;Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science and Engineering;Peng Cheng Laboratory;Department of Computer Science and Engineering", "aff_unique_url": "https://www.sustech.edu.cn;http://www.pcl.ac.cn;https://www.ust.hk", "aff_unique_abbr": "SUSTech;PCL;HKUST", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "China" }, { "title": "Supervised Learning with General Risk Functionals", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16765", "id": "16765", "proceeding": "https://proceedings.mlr.press/v162/leqi22a.html", "poster": "/media/PosterPDFs/ICML%202022/c5b2cebf15b205503560c4e8e6d1ea78.png?t=1658092903.0145352", "slides": "", "author_site": "Liu Leqi, Audrey Huang, Zachary Lipton, Kamyar Azizzadenesheli", "author": "Liu Leqi; Audrey Huang; Zachary Lipton; Kamyar Azizzadenesheli", "abstract": "Standard uniform convergence results bound the generalization gap of the expected loss over a hypothesis class. The emergence of risk-sensitive learning requires generalization guarantees for functionals of the loss distribution beyond the expectation. While prior works specialize in uniform convergence of particular functionals, our work provides uniform convergence for a general class of H\u00f6lder risk functionals for which the closeness in the Cumulative Distribution Function (CDF) entails closeness in risk. We establish the first uniform convergence results for estimating the CDF of the loss distribution, which yield uniform convergence guarantees that hold simultaneously both over a class of H\u00f6lder risk functionals and over a hypothesis class. Thus licensed to perform empirical risk minimization, we develop practical gradient-based methods for minimizing distortion risks (widely studied subset of H\u00f6lder risks that subsumes the spectral risks, including the mean, conditional value at risk, cumulative prospect theory risks, and others) and provide convergence guarantees. 
In experiments, we demonstrate the efficacy of our learning procedure, both in settings where uniform convergence results hold and in high-dimensional settings with deep networks.", "bibtex": "@InProceedings{pmlr-v162-leqi22a,\n title = \t {Supervised Learning with General Risk Functionals},\n author = {Leqi, Liu and Huang, Audrey and Lipton, Zachary and Azizzadenesheli, Kamyar},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12570--12592},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/leqi22a/leqi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/leqi22a.html},\n abstract = \t {Standard uniform convergence results bound the generalization gap of the expected loss over a hypothesis class. The emergence of risk-sensitive learning requires generalization guarantees for functionals of the loss distribution beyond the expectation. While prior works specialize in uniform convergence of particular functionals, our work provides uniform convergence for a general class of H\u00f6lder risk functionals for which the closeness in the Cumulative Distribution Function (CDF) entails closeness in risk. We establish the first uniform convergence results for estimating the CDF of the loss distribution, which yield uniform convergence guarantees that hold simultaneously both over a class of H\u00f6lder risk functionals and over a hypothesis class. Thus licensed to perform empirical risk minimization, we develop practical gradient-based methods for minimizing distortion risks (widely studied subset of H\u00f6lder risks that subsumes the spectral risks, including the mean, conditional value at risk, cumulative prospect theory risks, and others) and provide convergence guarantees. 
In experiments, we demonstrate the efficacy of our learning procedure, both in settings where uniform convergence results hold and in high-dimensional settings with deep networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/leqi22a/leqi22a.pdf", "supp": "", "pdf_size": 579571, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9595505786575743348&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Machine Learning Department, Carnegie Mellon University; Department of Computer Science, University of Illinois Urbana-Champaign; Machine Learning Department, Carnegie Mellon University; Department of Computer Science, Purdue University", "aff_domain": "cs.cmu.edu;illinois.edu; ; ", "email": "cs.cmu.edu;illinois.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/leqi22a.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Carnegie Mellon University;University of Illinois Urbana-Champaign;Purdue University", "aff_unique_dep": "Machine Learning Department;Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.cmu.edu;https://illinois.edu;https://www.purdue.edu", "aff_unique_abbr": "CMU;UIUC;Purdue", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Supervised Off-Policy Ranking", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18343", "id": "18343", "proceeding": "https://proceedings.mlr.press/v162/jin22f.html", "poster": "/media/PosterPDFs/ICML%202022/eb46c61f91aab8c2b002b288485fc118_Cjgm5dl.png?t=1657444129.0013735", "slides": "", "author_site": "Yue Jin, Yue Zhang, Tao Qin, Xudong Zhang, Jian Yuan, Houqiang Li, Tie-Yan Liu", "author": "Yue Jin; Yue Zhang; Tao Qin; Xudong Zhang; Jian Yuan; Houqiang Li; Tie-Yan Liu", "abstract": "Off-policy evaluation (OPE) is to evaluate a target policy with data generated by other policies. Most previous OPE methods focus on precisely estimating the true performance of a policy. We observe that in many applications, (1) the end goal of OPE is to compare two or multiple candidate policies and choose a good one, which is a much simpler task than precisely evaluating their true performance; and (2) there are usually multiple policies that have been deployed to serve users in real-world systems and thus the true performance of these policies can be known. Inspired by the two observations, in this work, we study a new problem, supervised off-policy ranking (SOPR), which aims to rank a set of target policies based on supervised learning by leveraging off-policy data and policies with known performance. We propose a method to solve SOPR, which learns a policy scoring model by minimizing a ranking loss of the training policies rather than estimating the precise policy performance. The scoring model in our method, a hierarchical Transformer based model, maps a set of state-action pairs to a score, where the state of each pair comes from the off-policy data and the action is taken by a target policy on the state in an offline manner. Extensive experiments on public datasets show that our method outperforms baseline methods in terms of rank correlation, regret value, and stability. 
Our code is publicly available at GitHub.", "bibtex": "@InProceedings{pmlr-v162-jin22f,\n title = \t {Supervised Off-Policy Ranking},\n author = {Jin, Yue and Zhang, Yue and Qin, Tao and Zhang, Xudong and Yuan, Jian and Li, Houqiang and Liu, Tie-Yan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10323--10339},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22f/jin22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22f.html},\n abstract = \t {Off-policy evaluation (OPE) is to evaluate a target policy with data generated by other policies. Most previous OPE methods focus on precisely estimating the true performance of a policy. We observe that in many applications, (1) the end goal of OPE is to compare two or multiple candidate policies and choose a good one, which is a much simpler task than precisely evaluating their true performance; and (2) there are usually multiple policies that have been deployed to serve users in real-world systems and thus the true performance of these policies can be known. Inspired by the two observations, in this work, we study a new problem, supervised off-policy ranking (SOPR), which aims to rank a set of target policies based on supervised learning by leveraging off-policy data and policies with known performance. We propose a method to solve SOPR, which learns a policy scoring model by minimizing a ranking loss of the training policies rather than estimating the precise policy performance. The scoring model in our method, a hierarchical Transformer based model, maps a set of state-action pairs to a score, where the state of each pair comes from the off-policy data and the action is taken by a target policy on the state in an offline manner. Extensive experiments on public datasets show that our method outperforms baseline methods in terms of rank correlation, regret value, and stability. 
Our code is publicly available at GitHub.}\n}", "pdf": "https://proceedings.mlr.press/v162/jin22f/jin22f.pdf", "supp": "", "pdf_size": 36261209, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12930957527069555602&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Department of Electronic Engineering, Tsinghua University; Department of Electronic Engineering and Information Science, University of Science and Technology of China; Microsoft Research Asia; Department of Electronic Engineering, Tsinghua University; Department of Electronic Engineering, Tsinghua University; Department of Electronic Engineering and Information Science, University of Science and Technology of China; Microsoft Research Asia", "aff_domain": "; ;microsoft.com; ; ; ; ", "email": "; ;microsoft.com; ; ; ; ", "github": "https://github.com/SOPR-T/SOPR-T", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/jin22f.html", "aff_unique_index": "0;1;2;0;0;1;2", "aff_unique_norm": "Tsinghua University;University of Science and Technology of China;Microsoft", "aff_unique_dep": "Department of Electronic Engineering;Department of Electronic Engineering and Information Science;Research", "aff_unique_url": "https://www.tsinghua.edu.cn;http://www.ustc.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "THU;USTC;MSR Asia", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Surrogate Likelihoods for Variational Annealed Importance Sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16031", "id": "16031", "proceeding": "https://proceedings.mlr.press/v162/jankowiak22a.html", "poster": "/media/PosterPDFs/ICML%202022/1e056d2b0ebd5c878c550da6ac5d3724_7SnCeFl.png?t=1657553437.638415", "slides": "", "author_site": "Martin Jankowiak, Du Phan", "author": "Martin Jankowiak; Du Phan", "abstract": "Variational inference is a powerful paradigm for approximate Bayesian inference with a number of appealing properties, including support for model learning and data subsampling. By contrast MCMC methods like Hamiltonian Monte Carlo do not share these properties but remain attractive since, contrary to parametric methods, MCMC is asymptotically unbiased. For these reasons researchers have sought to combine the strengths of both classes of algorithms, with recent approaches coming closer to realizing this vision in practice. However, supporting data subsampling in these hybrid methods can be a challenge, a shortcoming that we address by introducing a surrogate likelihood that can be learned jointly with other variational parameters. We argue theoretically that the resulting algorithm allows an intuitive trade-off between inference fidelity and computational cost. 
In an extensive empirical comparison we show that our method performs well in practice and that it is well-suited for black-box inference in probabilistic programming frameworks.", "bibtex": "@InProceedings{pmlr-v162-jankowiak22a,\n title = \t {Surrogate Likelihoods for Variational Annealed Importance Sampling},\n author = {Jankowiak, Martin and Phan, Du},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9881--9901},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jankowiak22a/jankowiak22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jankowiak22a.html},\n abstract = \t {Variational inference is a powerful paradigm for approximate Bayesian inference with a number of appealing properties, including support for model learning and data subsampling. By contrast MCMC methods like Hamiltonian Monte Carlo do not share these properties but remain attractive since, contrary to parametric methods, MCMC is asymptotically unbiased. For these reasons researchers have sought to combine the strengths of both classes of algorithms, with recent approaches coming closer to realizing this vision in practice. However, supporting data subsampling in these hybrid methods can be a challenge, a shortcoming that we address by introducing a surrogate likelihood that can be learned jointly with other variational parameters. We argue theoretically that the resulting algorithm allows an intuitive trade-off between inference fidelity and computational cost. In an extensive empirical comparison we show that our method performs well in practice and that it is well-suited for black-box inference in probabilistic programming frameworks.}\n}", "pdf": "https://proceedings.mlr.press/v162/jankowiak22a/jankowiak22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/jankowiak22a-supp.zip", "pdf_size": 604597, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9294195756749496542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Broad Institute, Cambridge, MA, USA; Google Research, Cambridge, MA, USA", "aff_domain": "broadinstitute.org; ", "email": "broadinstitute.org; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/jankowiak22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Broad Institute;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.broadinstitute.org;https://research.google", "aff_unique_abbr": ";Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Symmetric Machine Theory of Mind", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16607", "id": "16607", "proceeding": "https://proceedings.mlr.press/v162/sclar22a.html", "poster": "/media/PosterPDFs/ICML%202022/c570c225d1fb8a72ad79995dd17a77bc.png?t=1657932875.2711213", "slides": "", "author_site": "Melanie Sclar, Graham Neubig, Yonatan Bisk", "author": "Melanie Sclar; Graham Neubig; Yonatan Bisk", "abstract": "Theory of mind, the ability to model others\u2019 thoughts and desires, is a cornerstone of human social intelligence. 
This makes it an important challenge for the machine learning community, but previous works mainly attempt to design agents that model the \"mental state\" of others as passive observers or in specific predefined roles, such as in speaker-listener scenarios. In contrast, we propose to model machine theory of mind in a more general symmetric scenario. We introduce a multi-agent environment SymmToM where, like in real life, all agents can speak, listen, see other agents, and move freely through the world. Effective strategies to maximize an agent\u2019s reward require it to develop a theory of mind. We show that reinforcement learning agents that model the mental states of others achieve significant performance improvements over agents with no such theory of mind model. Importantly, our best agents still fail to achieve performance comparable to agents with access to the gold-standard mental state of other agents, demonstrating that the modeling of theory of mind in multi-agent scenarios is very much an open challenge.", "bibtex": "@InProceedings{pmlr-v162-sclar22a,\n title = \t {Symmetric Machine Theory of Mind},\n author = {Sclar, Melanie and Neubig, Graham and Bisk, Yonatan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19450--19466},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sclar22a/sclar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sclar22a.html},\n abstract = \t {Theory of mind, the ability to model others\u2019 thoughts and desires, is a cornerstone of human social intelligence. This makes it an important challenge for the machine learning community, but previous works mainly attempt to design agents that model the \"mental state\" of others as passive observers or in specific predefined roles, such as in speaker-listener scenarios. In contrast, we propose to model machine theory of mind in a more general symmetric scenario. We introduce a multi-agent environment SymmToM where, like in real life, all agents can speak, listen, see other agents, and move freely through the world. Effective strategies to maximize an agent\u2019s reward require it to develop a theory of mind. We show that reinforcement learning agents that model the mental states of others achieve significant performance improvements over agents with no such theory of mind model. Importantly, our best agents still fail to achieve performance comparable to agents with access to the gold-standard mental state of other agents, demonstrating that the modeling of theory of mind in multi-agent scenarios is very much an open challenge.}\n}", "pdf": "https://proceedings.mlr.press/v162/sclar22a/sclar22a.pdf", "supp": "", "pdf_size": 1577729, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14676997486280015283&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Paul G. 
Allen School of Computer Science & Engineering, University of Washington + Language Technologies Institute, Carnegie Mellon University; Language Technologies Institute, Carnegie Mellon University; Language Technologies Institute, Carnegie Mellon University", "aff_domain": "cs.washington.edu;cs.cmu.edu;cs.cmu.edu", "email": "cs.washington.edu;cs.cmu.edu;cs.cmu.edu", "github": "https://github.com/msclar/symmtom", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sclar22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "University of Washington;Carnegie Mellon University", "aff_unique_dep": "Paul G. Allen School of Computer Science & Engineering;Language Technologies Institute", "aff_unique_url": "https://www.washington.edu;https://www.cmu.edu", "aff_unique_abbr": "UW;CMU", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Seattle;Pittsburgh", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Synergy and Symmetry in Deep Learning: Interactions between the Data, Model, and Inference Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16697", "id": "16697", "proceeding": "https://proceedings.mlr.press/v162/xiao22a.html", "poster": "/media/PosterPDFs/ICML%202022/01922cbeae89ad4d79ab769e84e7c5da.png?t=1657548719.4182277", "slides": "", "author_site": "Lechao Xiao, Jeffrey Pennington", "author": "Lechao Xiao; Jeffrey Pennington", "abstract": "Although learning in high dimensions is commonly believed to suffer from the curse of dimensionality, modern machine learning methods often exhibit an astonishing power to tackle a wide range of challenging real-world learning problems without using abundant amounts of data. How exactly these methods break this curse remains a fundamental open question in the theory of deep learning. While previous efforts have investigated this question by studying the data ($\\mathcal D$), model ($\\mathcal M$), and inference algorithm ($\\mathcal I$) as independent modules, in this paper we analyze the triplet $(\\mathcal D, \\mathcal M, \\mathcal I)$ as an integrated system and identify important synergies that help mitigate the curse of dimensionality. We first study the basic symmetries associated with various learning algorithms ($\\mathcal M, \\mathcal I$), focusing on four prototypical architectures in deep learning: fully-connected networks, locally-connected networks, and convolutional networks with and without pooling. 
We find that learning is most efficient when these symmetries are compatible with those of the data distribution and that performance significantly deteriorates when any member of the $(\\mathcal D, \\mathcal M, \\mathcal I)$ triplet is inconsistent or suboptimal.", "bibtex": "@InProceedings{pmlr-v162-xiao22a,\n title = \t {Synergy and Symmetry in Deep Learning: Interactions between the Data, Model, and Inference Algorithm},\n author = {Xiao, Lechao and Pennington, Jeffrey},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24347--24369},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xiao22a/xiao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/xiao22a.html},\n abstract = \t {Although learning in high dimensions is commonly believed to suffer from the curse of dimensionality, modern machine learning methods often exhibit an astonishing power to tackle a wide range of challenging real-world learning problems without using abundant amounts of data. How exactly these methods break this curse remains a fundamental open question in the theory of deep learning. While previous efforts have investigated this question by studying the data ($\\mathcal D$), model ($\\mathcal M$), and inference algorithm ($\\mathcal I$) as independent modules, in this paper we analyze the triplet $(\\mathcal D, \\mathcal M, \\mathcal I)$ as an integrated system and identify important synergies that help mitigate the curse of dimensionality. We first study the basic symmetries associated with various learning algorithms ($\\mathcal M, \\mathcal I$), focusing on four prototypical architectures in deep learning: fully-connected networks, locally-connected networks, and convolutional networks with and without pooling. 
We find that learning is most efficient when these symmetries are compatible with those of the data distribution and that performance significantly deteriorates when any member of the $(\\mathcal D, \\mathcal M, \\mathcal I)$ triplet is inconsistent or suboptimal.}\n}", "pdf": "https://proceedings.mlr.press/v162/xiao22a/xiao22a.pdf", "supp": "", "pdf_size": 12461761, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11014722180286609805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com;google.com", "email": "google.com;google.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/xiao22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "TACTiS: Transformer-Attentional Copulas for Time Series", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16593", "id": "16593", "proceeding": "https://proceedings.mlr.press/v162/drouin22a.html", "poster": "/media/PosterPDFs/ICML%202022/514f94b7b871de0eacb221709d341aec.png?t=1657207167.39945", "slides": "", "author_site": "Alexandre Drouin, \u00c9tienne Marcotte, Nicolas Chapados", "author": "Alexandre Drouin; \u00c9tienne Marcotte; Nicolas Chapados", "abstract": "The estimation of time-varying quantities is a fundamental component of decision making in fields such as healthcare and finance. However, the practical utility of such estimates is limited by how accurately they quantify predictive uncertainty. In this work, we address the problem of estimating the joint predictive distribution of high-dimensional multivariate time series. We propose a versatile method, based on the transformer architecture, that estimates joint distributions using an attention-based decoder that provably learns to mimic the properties of non-parametric copulas. The resulting model has several desirable properties: it can scale to hundreds of time series, supports both forecasting and interpolation, can handle unaligned and non-uniformly sampled data, and can seamlessly adapt to missing data during training. We demonstrate these properties empirically and show that our model produces state-of-the-art predictions on multiple real-world datasets.", "bibtex": "@InProceedings{pmlr-v162-drouin22a,\n title = \t {{TACT}i{S}: Transformer-Attentional Copulas for Time Series},\n author = {Drouin, Alexandre and Marcotte, \\'Etienne and Chapados, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5447--5493},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/drouin22a/drouin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/drouin22a.html},\n abstract = \t {The estimation of time-varying quantities is a fundamental component of decision making in fields such as healthcare and finance. However, the practical utility of such estimates is limited by how accurately they quantify predictive uncertainty. 
In this work, we address the problem of estimating the joint predictive distribution of high-dimensional multivariate time series. We propose a versatile method, based on the transformer architecture, that estimates joint distributions using an attention-based decoder that provably learns to mimic the properties of non-parametric copulas. The resulting model has several desirable properties: it can scale to hundreds of time series, supports both forecasting and interpolation, can handle unaligned and non-uniformly sampled data, and can seamlessly adapt to missing data during training. We demonstrate these properties empirically and show that our model produces state-of-the-art predictions on multiple real-world datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/drouin22a/drouin22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/drouin22a-supp.zip", "pdf_size": 6095144, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5604382526172400005&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "ServiceNow Research; ServiceNow Research; ServiceNow Research", "aff_domain": "servicenow.com;servicenow.com;servicenow.com", "email": "servicenow.com;servicenow.com;servicenow.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/drouin22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ServiceNow", "aff_unique_dep": "Research", "aff_unique_url": "https://www.servicenow.com", "aff_unique_abbr": "ServiceNow", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "TAM: Topology-Aware Margin Loss for Class-Imbalanced Node Classification", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18103", "id": "18103", "proceeding": "https://proceedings.mlr.press/v162/song22a.html", "poster": "/media/PosterPDFs/ICML%202022/74f23f9e28cbc5ddaae8582f48642a59.png?t=1657724907.2306092", "slides": "", "author_site": "Jaeyun Song, Joonhyung Park, Eunho Yang", "author": "Jaeyun Song; Joonhyung Park; Eunho Yang", "abstract": "Learning unbiased node representations under class-imbalanced graph data is challenging due to interactions between adjacent nodes. Existing studies have in common that they compensate the minor class nodes \u2018as a group\u2019 according to their overall quantity (ignoring node connections in graph), which inevitably increase the false positive cases for major nodes. We hypothesize that the increase in these false positive cases is highly affected by the label distribution around each node and confirm it experimentally. In addition, in order to handle this issue, we propose Topology-Aware Margin (TAM) to reflect local topology on the learning objective. Our method compares the connectivity pattern of each node with the class-averaged counter-part and adaptively adjusts the margin accordingly based on that. 
Our method consistently exhibits superiority over the baselines on various node classification benchmark datasets with representative GNN architectures.", "bibtex": "@InProceedings{pmlr-v162-song22a,\n title = \t {{TAM}: Topology-Aware Margin Loss for Class-Imbalanced Node Classification},\n author = {Song, Jaeyun and Park, Joonhyung and Yang, Eunho},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20369--20383},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/song22a/song22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/song22a.html},\n abstract = \t {Learning unbiased node representations under class-imbalanced graph data is challenging due to interactions between adjacent nodes. Existing studies have in common that they compensate the minor class nodes \u2018as a group\u2019 according to their overall quantity (ignoring node connections in graph), which inevitably increase the false positive cases for major nodes. We hypothesize that the increase in these false positive cases is highly affected by the label distribution around each node and confirm it experimentally. In addition, in order to handle this issue, we propose Topology-Aware Margin (TAM) to reflect local topology on the learning objective. Our method compares the connectivity pattern of each node with the class-averaged counter-part and adaptively adjusts the margin accordingly based on that. Our method consistently exhibits superiority over the baselines on various node classification benchmark datasets with representative GNN architectures.}\n}", "pdf": "https://proceedings.mlr.press/v162/song22a/song22a.pdf", "supp": "", "pdf_size": 656440, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14444728857510176156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea+AITRICS, Seoul, South Korea; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea+AITRICS, Seoul, South Korea; Graduate School of AI, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/song22a.html", "aff_unique_index": "0+1;0+1;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;AITRICS", "aff_unique_dep": "Graduate School of AI;", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "0+1;0+1;0", "aff_campus_unique": "Daejeon;Seoul", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "South Korea" }, { "title": "TPC: Transformation-Specific Smoothing for Point Cloud Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18365", "id": "18365", "proceeding": "https://proceedings.mlr.press/v162/chu22b.html", "poster": "/media/PosterPDFs/ICML%202022/08fc80de8121419136e443a70489c123.png?t=1657870433.8599374", "slides": "", "author_site": "Wenda Chu, Linyi Li, Bo Li", "author": "Wenda Chu; Linyi Li; Bo Li", 
"abstract": "Point cloud models with neural network architectures have achieved great success and been widely used in safety-critical applications, such as Lidar-based recognition systems in autonomous vehicles. However, such models are shown vulnerable against adversarial attacks which aim to apply stealthy semantic transformations such as rotation and tapering to mislead model predictions. In this paper, we propose a transformation-specific smoothing framework TPC, which provides tight and scalable robustness guarantees for point cloud models against semantic transformation attacks. We first categorize common 3D transformations into two categories: composable (e.g., rotation) and indirectly composable (e.g., tapering), and we present generic robustness certification strategies for both categories. We then specify unique certification protocols for a range of specific semantic transformations and derive strong robustness guarantees. Extensive experiments on several common 3D transformations show that TPC significantly outperforms the state of the art. For example, our framework boosts the certified accuracy against twisting transformation along z-axis (within $\\pm$20{\\textdegree}) from 20.3% to 83.8%. Codes and models are available at https://github.com/Qianhewu/Point-Cloud-Smoothing.", "bibtex": "@InProceedings{pmlr-v162-chu22b,\n title = \t {{TPC}: Transformation-Specific Smoothing for Point Cloud Models},\n author = {Chu, Wenda and Li, Linyi and Li, Bo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4035--4056},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chu22b/chu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/chu22b.html},\n abstract = \t {Point cloud models with neural network architectures have achieved great success and been widely used in safety-critical applications, such as Lidar-based recognition systems in autonomous vehicles. However, such models are shown vulnerable against adversarial attacks which aim to apply stealthy semantic transformations such as rotation and tapering to mislead model predictions. In this paper, we propose a transformation-specific smoothing framework TPC, which provides tight and scalable robustness guarantees for point cloud models against semantic transformation attacks. We first categorize common 3D transformations into two categories: composable (e.g., rotation) and indirectly composable (e.g., tapering), and we present generic robustness certification strategies for both categories. We then specify unique certification protocols for a range of specific semantic transformations and derive strong robustness guarantees. Extensive experiments on several common 3D transformations show that TPC significantly outperforms the state of the art. For example, our framework boosts the certified accuracy against twisting transformation along z-axis (within $\\pm$20{\\textdegree}) from 20.3% to 83.8%. 
Codes and models are available at https://github.com/Qianhewu/Point-Cloud-Smoothing.}\n}", "pdf": "https://proceedings.mlr.press/v162/chu22b/chu22b.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/chu22b-supp.zip", "pdf_size": 1692566, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1615330508189196141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University, Beijing, P. R. China (work done during remote internship at UIUC)+University of Illinois Urbana-Champaign (UIUC), Illinois, USA; University of Illinois Urbana-Champaign (UIUC), Illinois, USA; University of Illinois Urbana-Champaign (UIUC), Illinois, USA", "aff_domain": "mails.tsinghua.edu.cn;illinois.edu;illinois.edu", "email": "mails.tsinghua.edu.cn;illinois.edu;illinois.edu", "github": "https://github.com/Qianhewu/Point-Cloud-Smoothing", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chu22b.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Tsinghua University;University of Illinois Urbana-Champaign", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://illinois.edu", "aff_unique_abbr": "Tsinghua;UIUC", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Beijing;Urbana-Champaign", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "China;United States" }, { "title": "TSPipe: Learn from Teacher Faster with Pipelines", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18107", "id": "18107", "proceeding": "https://proceedings.mlr.press/v162/lim22a.html", "poster": "/media/PosterPDFs/ICML%202022/c16a5320fa475530d9583c34fd356ef5.png?t=1657507452.3127584", "slides": "", "author_site": "Hwijoon Lim, Yechan Kim, Sukmin Yun, Jinwoo Shin, Dongsu Han", "author": "Hwijoon Lim; Yechan Kim; Sukmin Yun; Jinwoo Shin; Dongsu Han", "abstract": "The teacher-student (TS) framework, training a (student) network by utilizing an auxiliary superior (teacher) network, has been adopted as a popular training paradigm in many machine learning schemes, since the seminal work\u2014Knowledge distillation (KD) for model compression and transfer learning. Many recent self-supervised learning (SSL) schemes also adopt the TS framework, where teacher networks are maintained as the moving average of student networks, called the momentum networks. This paper presents TSPipe, a pipelined approach to accelerate the training process of any TS frameworks including KD and SSL. Under the observation that the teacher network does not need a backward pass, our main idea is to schedule the computation of the teacher and student network separately, and fully utilize the GPU during training by interleaving the computations of the two networks and relaxing their dependencies. In case the teacher network requires a momentum update, we use delayed parameter updates only on the teacher network to attain high model accuracy. 
Compared to existing pipeline parallelism schemes, which sacrifice either training throughput or model accuracy, TSPipe provides better performance trade-offs, achieving up to 12.15x higher throughput.", "bibtex": "@InProceedings{pmlr-v162-lim22a,\n title = \t {{TSP}ipe: Learn from Teacher Faster with Pipelines},\n author = {Lim, Hwijoon and Kim, Yechan and Yun, Sukmin and Shin, Jinwoo and Han, Dongsu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13302--13312},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lim22a/lim22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lim22a.html},\n abstract = \t {The teacher-student (TS) framework, training a (student) network by utilizing an auxiliary superior (teacher) network, has been adopted as a popular training paradigm in many machine learning schemes, since the seminal work\u2014Knowledge distillation (KD) for model compression and transfer learning. Many recent self-supervised learning (SSL) schemes also adopt the TS framework, where teacher networks are maintained as the moving average of student networks, called the momentum networks. This paper presents TSPipe, a pipelined approach to accelerate the training process of any TS frameworks including KD and SSL. Under the observation that the teacher network does not need a backward pass, our main idea is to schedule the computation of the teacher and student network separately, and fully utilize the GPU during training by interleaving the computations of the two networks and relaxing their dependencies. In case the teacher network requires a momentum update, we use delayed parameter updates only on the teacher network to attain high model accuracy. 
Compared to existing pipeline parallelism schemes, which sacrifice either training throughput or model accuracy, TSPipe provides better performance trade-offs, achieving up to 12.15x higher throughput.}\n}", "pdf": "https://proceedings.mlr.press/v162/lim22a/lim22a.pdf", "supp": "", "pdf_size": 901355, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11084431265172504357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "School of Electrical Engineering, KAIST, Daejeon, Republic of Korea+Kim Jaechul Graduate School of AI, KAIST, Daejeon, Republic of Korea; School of Electrical Engineering, KAIST, Daejeon, Republic of Korea+Kim Jaechul Graduate School of AI, KAIST, Daejeon, Republic of Korea; School of Electrical Engineering, KAIST, Daejeon, Republic of Korea+Kim Jaechul Graduate School of AI, KAIST, Daejeon, Republic of Korea; School of Electrical Engineering, KAIST, Daejeon, Republic of Korea+Kim Jaechul Graduate School of AI, KAIST, Daejeon, Republic of Korea; School of Electrical Engineering, KAIST, Daejeon, Republic of Korea+Kim Jaechul Graduate School of AI, KAIST, Daejeon, Republic of Korea", "aff_domain": "kaist.ac.kr; ; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ; ;kaist.ac.kr", "github": "https://github.com/kaist-ina/TSPipe", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/lim22a.html", "aff_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_unique_norm": "KAIST", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "South Korea" }, { "title": "TURF: Two-Factor, Universal, Robust, Fast Distribution Learning Algorithm", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17869", "id": "17869", "proceeding": "https://proceedings.mlr.press/v162/hao22a.html", "poster": "/media/PosterPDFs/ICML%202022/fcf55a303b71b84d326fb1d06e332a26.png?t=1658040840.6636627", "slides": "", "author_site": "Yi Hao, Ayush Jain, Alon Orlitsky, Vaishakh Ravindrakumar", "author": "Yi Hao; Ayush Jain; Alon Orlitsky; Vaishakh Ravindrakumar", "abstract": "Approximating distributions from their samples is a canonical statistical-learning problem. One of its most powerful and successful modalities approximates every distribution to an $\\ell_1$ distance essentially at most a constant times larger than its closest $t$-piece degree-$d$ polynomial, where $t\\ge1$ and $d\\ge0$. Letting $c_{t,d}$ denote the smallest such factor, clearly $c_{1,0}=1$, and it can be shown that $c_{t,d}\\ge 2$ for all other $t$ and $d$. Yet current computationally efficient algorithms show only $c_{t,1}\\le 2.25$ and the bound rises quickly to $c_{t,d}\\le 3$ for $d\\ge 9$. We derive a near-linear-time and essentially sample-optimal estimator that establishes $c_{t,d}=2$ for all $(t,d)\\ne(1,0)$. Additionally, for many practical distributions, the lowest approximation distance is achieved by polynomials with vastly varying number of pieces. We provide a method that estimates this number near-optimally, hence helps approach the best possible approximation. 
Experiments combining the two techniques confirm improved performance over existing methodologies.", "bibtex": "@InProceedings{pmlr-v162-hao22a,\n title = \t {{TURF}: Two-Factor, Universal, Robust, Fast Distribution Learning Algorithm},\n author = {Hao, Yi and Jain, Ayush and Orlitsky, Alon and Ravindrakumar, Vaishakh},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8427--8445},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hao22a/hao22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hao22a.html},\n abstract = \t {Approximating distributions from their samples is a canonical statistical-learning problem. One of its most powerful and successful modalities approximates every distribution to an $\\ell_1$ distance essentially at most a constant times larger than its closest $t$-piece degree-$d$ polynomial, where $t\\ge1$ and $d\\ge0$. Letting $c_{t,d}$ denote the smallest such factor, clearly $c_{1,0}=1$, and it can be shown that $c_{t,d}\\ge 2$ for all other $t$ and $d$. Yet current computationally efficient algorithms show only $c_{t,1}\\le 2.25$ and the bound rises quickly to $c_{t,d}\\le 3$ for $d\\ge 9$. We derive a near-linear-time and essentially sample-optimal estimator that establishes $c_{t,d}=2$ for all $(t,d)\\ne(1,0)$. Additionally, for many practical distributions, the lowest approximation distance is achieved by polynomials with vastly varying number of pieces. We provide a method that estimates this number near-optimally, hence helps approach the best possible approximation. 
Experiments combining the two techniques confirm improved performance over existing methodologies.}\n}", "pdf": "https://proceedings.mlr.press/v162/hao22a/hao22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/hao22a-supp.zip", "pdf_size": 492611, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sJmOD-Ry8dcJ:scholar.google.com/&scioq=TURF:+Two-Factor,+Universal,+Robust,+Fast+Distribution+Learning+Algorithm&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff": "Electrical and Computer Engineering, University of California, San Diego; Electrical and Computer Engineering, University of California, San Diego; Electrical and Computer Engineering, University of California, San Diego; Electrical and Computer Engineering, University of California, San Diego", "aff_domain": "ucsd.edu; ; ; ", "email": "ucsd.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hao22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "Electrical and Computer Engineering", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Tackling Data Heterogeneity: A New Unified Framework for Decentralized SGD with Sample-induced Topology", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17883", "id": "17883", "proceeding": "https://proceedings.mlr.press/v162/huang22i.html", "poster": "/media/PosterPDFs/ICML%202022/32b683d9d8e73d3eeb6bf08fe0817402_hWHn9yH.png?t=1656720524.8347216", "slides": "/media/icml-2022/Slides/17883.pdf", "author_site": "Yan Huang, Ying Sun, Zehan Zhu, Changzhi Yan, Jinming Xu", "author": "Yan Huang; Ying Sun; Zehan Zhu; Changzhi Yan; Jinming Xu", "abstract": "We develop a general framework unifying several gradient-based stochastic optimization methods for empirical risk minimization problems both in centralized and distributed scenarios. The framework hinges on the introduction of an augmented graph consisting of nodes modeling the samples and edges modeling both the inter-device communication and intra-device stochastic gradient computation. By designing properly the topology of the augmented graph, we are able to recover as special cases the renowned Local-SGD and DSGD algorithms, and provide a unified perspective for variance-reduction (VR) and gradient-tracking (GT) methods such as SAGA, Local-SVRG and GT-SAGA. We also provide a unified convergence analysis for smooth and (strongly) convex objectives relying on a proper structured Lyapunov function, and the obtained rate can recover the best known results for many existing algorithms. The rate results further reveal that VR and GT methods can effectively eliminate data heterogeneity within and across devices, respectively, enabling the exact convergence of the algorithm to the optimal solution. 
Numerical experiments confirm the findings in this paper.", "bibtex": "@InProceedings{pmlr-v162-huang22i,\n title = \t {Tackling Data Heterogeneity: A New Unified Framework for Decentralized {SGD} with Sample-induced Topology},\n author = {Huang, Yan and Sun, Ying and Zhu, Zehan and Yan, Changzhi and Xu, Jinming},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9310--9345},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huang22i/huang22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/huang22i.html},\n abstract = \t {We develop a general framework unifying several gradient-based stochastic optimization methods for empirical risk minimization problems both in centralized and distributed scenarios. The framework hinges on the introduction of an augmented graph consisting of nodes modeling the samples and edges modeling both the inter-device communication and intra-device stochastic gradient computation. By designing properly the topology of the augmented graph, we are able to recover as special cases the renowned Local-SGD and DSGD algorithms, and provide a unified perspective for variance-reduction (VR) and gradient-tracking (GT) methods such as SAGA, Local-SVRG and GT-SAGA. We also provide a unified convergence analysis for smooth and (strongly) convex objectives relying on a proper structured Lyapunov function, and the obtained rate can recover the best known results for many existing algorithms. The rate results further reveal that VR and GT methods can effectively eliminate data heterogeneity within and across devices, respectively, enabling the exact convergence of the algorithm to the optimal solution. 
Numerical experiments confirm the findings in this paper.}\n}", "pdf": "https://proceedings.mlr.press/v162/huang22i/huang22i.pdf", "supp": "", "pdf_size": 16268250, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17552716735045434957&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "College of Control Science and Engineering, Zhejiang University, Hangzhou, China; School of Electrical Engineering and Computer Science, The Pennsylvania State University, PA 16802, USA; College of Control Science and Engineering, Zhejiang University, Hangzhou, China; College of Control Science and Engineering, Zhejiang University, Hangzhou, China; College of Control Science and Engineering, Zhejiang University, Hangzhou, China", "aff_domain": "zju.edu.cn; ; ; ; ", "email": "zju.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/huang22i.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Zhejiang University;Pennsylvania State University", "aff_unique_dep": "College of Control Science and Engineering;School of Electrical Engineering and Computer Science", "aff_unique_url": "http://www.zju.edu.cn;https://www.psu.edu", "aff_unique_abbr": "ZJU;PSU", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Hangzhou;University Park", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Tackling covariate shift with node-based Bayesian neural networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17367", "id": "17367", "proceeding": "https://proceedings.mlr.press/v162/trinh22a.html", "poster": "/media/PosterPDFs/ICML%202022/82debd8a12b498e765a11a8e51159440_qExaNRB.png?t=1657529451.9101923", "slides": "/media/icml-2022/Slides/17367_Lpvwx6c.pdf", "author_site": "Trung Trinh, Markus Heinonen, Luigi Acerbi, Samuel Kaski", "author": "Trung Q Trinh; Markus Heinonen; Luigi Acerbi; Samuel Kaski", "abstract": "Bayesian neural networks (BNNs) promise improved generalization under covariate shift by providing principled probabilistic representations of epistemic uncertainty. However, weight-based BNNs often struggle with high computational complexity of large-scale architectures and datasets. Node-based BNNs have recently been introduced as scalable alternatives, which induce epistemic uncertainty by multiplying each hidden node with latent random variables, while learning a point-estimate of the weights. In this paper, we interpret these latent noise variables as implicit representations of simple and domain-agnostic data perturbations during training, producing BNNs that perform well under covariate shift due to input corruptions. We observe that the diversity of the implicit corruptions depends on the entropy of the latent variables, and propose a straightforward approach to increase the entropy of these variables during training. We evaluate the method on out-of-distribution image classification benchmarks, and show improved uncertainty estimation of node-based BNNs under covariate shift due to input perturbations. 
As a side effect, the method also provides robustness against noisy training labels.", "bibtex": "@InProceedings{pmlr-v162-trinh22a,\n title = \t {Tackling covariate shift with node-based {B}ayesian neural networks},\n author = {Trinh, Trung Q and Heinonen, Markus and Acerbi, Luigi and Kaski, Samuel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21751--21775},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/trinh22a/trinh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/trinh22a.html},\n abstract = \t {Bayesian neural networks (BNNs) promise improved generalization under covariate shift by providing principled probabilistic representations of epistemic uncertainty. However, weight-based BNNs often struggle with high computational complexity of large-scale architectures and datasets. Node-based BNNs have recently been introduced as scalable alternatives, which induce epistemic uncertainty by multiplying each hidden node with latent random variables, while learning a point-estimate of the weights. In this paper, we interpret these latent noise variables as implicit representations of simple and domain-agnostic data perturbations during training, producing BNNs that perform well under covariate shift due to input corruptions. We observe that the diversity of the implicit corruptions depends on the entropy of the latent variables, and propose a straightforward approach to increase the entropy of these variables during training. We evaluate the method on out-of-distribution image classification benchmarks, and show improved uncertainty estimation of node-based BNNs under covariate shift due to input perturbations. 
As a side effect, the method also provides robustness against noisy training labels.}\n}", "pdf": "https://proceedings.mlr.press/v162/trinh22a/trinh22a.pdf", "supp": "", "pdf_size": 1421253, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8088780476336589916&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "Department of Computer Science, Aalto University, Finland; Department of Computer Science, Aalto University, Finland; Department of Computer Science, University of Helsinki, Finland; Department of Computer Science, Aalto University, Finland + Department of Computer Science, University of Manchester, UK", "aff_domain": "aalto.fi; ; ; ", "email": "aalto.fi; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/trinh22a.html", "aff_unique_index": "0;0;1;0+2", "aff_unique_norm": "Aalto University;University of Helsinki;University of Manchester", "aff_unique_dep": "Department of Computer Science;Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.aalto.fi;https://www.helsinki.fi;https://www.manchester.ac.uk", "aff_unique_abbr": "Aalto;UH;UoM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+1", "aff_country_unique": "Finland;United Kingdom" }, { "title": "Task-aware Privacy Preservation for Multi-dimensional Data", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16589", "id": "16589", "proceeding": "https://proceedings.mlr.press/v162/cheng22a.html", "poster": "/media/PosterPDFs/ICML%202022/7949e456002b28988d38185bd30e77fd.png?t=1655405702.1041915", "slides": "", "author_site": "Jiangnan Cheng, Ao Tang, Sandeep Chinchali", "author": "Jiangnan Cheng; Ao Tang; Sandeep Chinchali", "abstract": "Local differential privacy (LDP) can be adopted to anonymize richer user data attributes that will be input to sophisticated machine learning (ML) tasks. However, today\u2019s LDP approaches are largely task-agnostic and often lead to severe performance loss \u2013 they simply inject noise to all data attributes according to a given privacy budget, regardless of what features are most relevant for the ultimate task. In this paper, we address how to significantly improve the ultimate task performance with multi-dimensional user data by considering a task-aware privacy preservation problem. The key idea is to use an encoder-decoder framework to learn (and anonymize) a task-relevant latent representation of user data. We obtain an analytical near-optimal solution for the linear setting with mean-squared error (MSE) task loss. We also provide an approximate solution through a gradient-based learning algorithm for general nonlinear cases. 
Extensive experiments demonstrate that our task-aware approach significantly improves ultimate task accuracy compared to standard benchmark LDP approaches with the same level of privacy guarantee.", "bibtex": "@InProceedings{pmlr-v162-cheng22a,\n title = \t {Task-aware Privacy Preservation for Multi-dimensional Data},\n author = {Cheng, Jiangnan and Tang, Ao and Chinchali, Sandeep},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3835--3851},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/cheng22a/cheng22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/cheng22a.html},\n abstract = \t {Local differential privacy (LDP) can be adopted to anonymize richer user data attributes that will be input to sophisticated machine learning (ML) tasks. However, today\u2019s LDP approaches are largely task-agnostic and often lead to severe performance loss \u2013 they simply inject noise to all data attributes according to a given privacy budget, regardless of what features are most relevant for the ultimate task. In this paper, we address how to significantly improve the ultimate task performance with multi-dimensional user data by considering a task-aware privacy preservation problem. The key idea is to use an encoder-decoder framework to learn (and anonymize) a task-relevant latent representation of user data. We obtain an analytical near-optimal solution for the linear setting with mean-squared error (MSE) task loss. We also provide an approximate solution through a gradient-based learning algorithm for general nonlinear cases. Extensive experiments demonstrate that our task-aware approach significantly improves ultimate task accuracy compared to standard benchmark LDP approaches with the same level of privacy guarantee.}\n}", "pdf": "https://proceedings.mlr.press/v162/cheng22a/cheng22a.pdf", "supp": "", "pdf_size": 574378, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12634725104863101184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "School of Electrical and Computer Engineering, Cornell University, Ithaca, NY; School of Electrical and Computer Engineering, Cornell University, Ithaca, NY; Department of Electrical and Computer Engineering, The University of Texas at Austin, Austin, TX", "aff_domain": "cornell.edu;cornell.edu;utexas.edu", "email": "cornell.edu;cornell.edu;utexas.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/cheng22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Cornell University;University of Texas at Austin", "aff_unique_dep": "School of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.cornell.edu;https://www.utexas.edu", "aff_unique_abbr": "Cornell;UT Austin", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Ithaca;Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Tell me why! 
Explanations support learning relational and causal structure", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16191", "id": "16191", "proceeding": "https://proceedings.mlr.press/v162/lampinen22a.html", "poster": "/media/PosterPDFs/ICML%202022/49ad23d1ec9fa4bd8d77d02681df5cfa.png?t=1657721285.134659", "slides": "", "author_site": "Andrew Lampinen, Nicholas Roy, Ishita Dasgupta, Stephanie Chan, Allison Tam, James McClelland, Chen Yan, Adam Santoro, Neil Rabinowitz, Jane Wang, Felix Hill", "author": "Andrew K Lampinen; Nicholas Roy; Ishita Dasgupta; Stephanie Cy Chan; Allison Tam; James Mcclelland; Chen Yan; Adam Santoro; Neil C Rabinowitz; Jane Wang; Felix Hill", "abstract": "Inferring the abstract relational and causal structure of the world is a major challenge for reinforcement-learning (RL) agents. For humans, language{\u2014}particularly in the form of explanations{\u2014}plays a considerable role in overcoming this challenge. Here, we show that language can play a similar role for deep RL agents in complex environments. While agents typically struggle to acquire relational and causal knowledge, augmenting their experience by training them to predict language descriptions and explanations can overcome these limitations. We show that language can help agents learn challenging relational tasks, and examine which aspects of language contribute to its benefits. We then show that explanations can help agents to infer not only relational but also causal structure. Language can shape the way that agents generalize out-of-distribution from ambiguous, causally-confounded training, and explanations even allow agents to learn to perform experimental interventions to identify causal relationships. Our results suggest that language description and explanation may be powerful tools for improving agent learning and generalization.", "bibtex": "@InProceedings{pmlr-v162-lampinen22a,\n title = \t {Tell me why! {E}xplanations support learning relational and causal structure},\n author = {Lampinen, Andrew K and Roy, Nicholas and Dasgupta, Ishita and Chan, Stephanie Cy and Tam, Allison and Mcclelland, James and Yan, Chen and Santoro, Adam and Rabinowitz, Neil C and Wang, Jane and Hill, Felix},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11868--11890},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lampinen22a/lampinen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/lampinen22a.html},\n abstract = \t {Inferring the abstract relational and causal structure of the world is a major challenge for reinforcement-learning (RL) agents. For humans, language{\u2014}particularly in the form of explanations{\u2014}plays a considerable role in overcoming this challenge. Here, we show that language can play a similar role for deep RL agents in complex environments. While agents typically struggle to acquire relational and causal knowledge, augmenting their experience by training them to predict language descriptions and explanations can overcome these limitations. We show that language can help agents learn challenging relational tasks, and examine which aspects of language contribute to its benefits. 
We then show that explanations can help agents to infer not only relational but also causal structure. Language can shape the way that agents generalize out-of-distribution from ambiguous, causally-confounded training, and explanations even allow agents to learn to perform experimental interventions to identify causal relationships. Our results suggest that language description and explanation may be powerful tools for improving agent learning and generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/lampinen22a/lampinen22a.pdf", "supp": "", "pdf_size": 1543621, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9093718010434750052&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "deepmind.com; ; ; ; ; ; ; ; ; ; ", "email": "deepmind.com; ; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v162/lampinen22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Temporal Difference Learning for Model Predictive Control", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17049", "id": "17049", "proceeding": "https://proceedings.mlr.press/v162/hansen22a.html", "poster": "/media/PosterPDFs/ICML%202022/e1226495c14f1a62ae17aa76c1f0d457.png?t=1657260335.9410415", "slides": "/media/icml-2022/Slides/17049.pdf", "author_site": "Nicklas Hansen, Hao Su, Xiaolong Wang", "author": "Nicklas A Hansen; Hao Su; Xiaolong Wang", "abstract": "Data-driven model predictive control has two key advantages over model-free methods: a potential for improved sample efficiency through model learning, and better performance as computational budget for planning increases. However, it is both costly to plan over long horizons and challenging to obtain an accurate model of the environment. In this work, we combine the strengths of model-free and model-based methods. We use a learned task-oriented latent dynamics model for local trajectory optimization over a short horizon, and use a learned terminal value function to estimate long-term return, both of which are learned jointly by temporal difference learning. Our method, TD-MPC, achieves superior sample efficiency and asymptotic performance over prior work on both state and image-based continuous control tasks from DMControl and Meta-World. 
Code and videos are available at https://nicklashansen.github.io/td-mpc.", "bibtex": "@InProceedings{pmlr-v162-hansen22a,\n title = \t {Temporal Difference Learning for Model Predictive Control},\n author = {Hansen, Nicklas A and Su, Hao and Wang, Xiaolong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8387--8406},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hansen22a/hansen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hansen22a.html},\n abstract = \t {Data-driven model predictive control has two key advantages over model-free methods: a potential for improved sample efficiency through model learning, and better performance as computational budget for planning increases. However, it is both costly to plan over long horizons and challenging to obtain an accurate model of the environment. In this work, we combine the strengths of model-free and model-based methods. We use a learned task-oriented latent dynamics model for local trajectory optimization over a short horizon, and use a learned terminal value function to estimate long-term return, both of which are learned jointly by temporal difference learning. Our method, TD-MPC, achieves superior sample efficiency and asymptotic performance over prior work on both state and image-based continuous control tasks from DMControl and Meta-World. Code and videos are available at https://nicklashansen.github.io/td-mpc.}\n}", "pdf": "https://proceedings.mlr.press/v162/hansen22a/hansen22a.pdf", "supp": "", "pdf_size": 12435898, "gs_citation": 278, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10762661949285432757&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "UC San Diego; UC San Diego; UC San Diego", "aff_domain": "ucsd.edu; ; ", "email": "ucsd.edu; ; ", "github": "https://nicklashansen.github.io/td-mpc", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hansen22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Test-Time Training Can Close the Natural Distribution Shift Performance Gap in Deep Learning Based Compressed Sensing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17923", "id": "17923", "proceeding": "https://proceedings.mlr.press/v162/darestani22a.html", "poster": "/media/PosterPDFs/ICML%202022/fa3060edb66e6ff4507886f9912e1ab9.png?t=1657556536.4785132", "slides": "", "author_site": "Mohammad Zalbagi Darestani, Jiayu Liu, Reinhard Heckel", "author": "Mohammad Zalbagi Darestani; Jiayu Liu; Reinhard Heckel", "abstract": "Deep learning based image reconstruction methods outperform traditional methods. However, neural networks suffer from a performance drop when applied to images from a different distribution than the training images. 
For example, a model trained for reconstructing knees in accelerated magnetic resonance imaging (MRI) does not reconstruct brains well, even though the same network trained on brains reconstructs brains perfectly well. Thus there is a distribution shift performance gap for a given neural network, defined as the difference in performance when training on a distribution $P$ and training on another distribution $Q$, and evaluating both models on $Q$. In this work, we propose a domain adaptation method for deep learning based compressive sensing that relies on self-supervision during training paired with test-time training at inference. We show that for four natural distribution shifts, this method essentially closes the distribution shift performance gap for state-of-the-art architectures for accelerated MRI.", "bibtex": "@InProceedings{pmlr-v162-darestani22a,\n title = \t {Test-Time Training Can Close the Natural Distribution Shift Performance Gap in Deep Learning Based Compressed Sensing},\n author = {Darestani, Mohammad Zalbagi and Liu, Jiayu and Heckel, Reinhard},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4754--4776},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/darestani22a/darestani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/darestani22a.html},\n abstract = \t {Deep learning based image reconstruction methods outperform traditional methods. However, neural networks suffer from a performance drop when applied to images from a different distribution than the training images. For example, a model trained for reconstructing knees in accelerated magnetic resonance imaging (MRI) does not reconstruct brains well, even though the same network trained on brains reconstructs brains perfectly well. Thus there is a distribution shift performance gap for a given neural network, defined as the difference in performance when training on a distribution $P$ and training on another distribution $Q$, and evaluating both models on $Q$. In this work, we propose a domain adaptation method for deep learning based compressive sensing that relies on self-supervision during training paired with test-time training at inference. 
We show that for four natural distribution shifts, this method essentially closes the distribution shift performance gap for state-of-the-art architectures for accelerated MRI.}\n}", "pdf": "https://proceedings.mlr.press/v162/darestani22a/darestani22a.pdf", "supp": "", "pdf_size": 1662173, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17586372982715627644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, Rice University; Department of Electrical and Computer Engineering, Technical University of Munich; Department of Electrical and Computer Engineering, Rice University", "aff_domain": "rice.edu; ;rice.edu", "email": "rice.edu; ;rice.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/darestani22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Rice University;Technical University of Munich", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.rice.edu;https://www.tum.de", "aff_unique_abbr": "Rice;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;Germany" }, { "title": "The Algebraic Path Problem for Graph Metrics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17831", "id": "17831", "proceeding": "https://proceedings.mlr.press/v162/sanmarti-n22a.html", "poster": "/media/PosterPDFs/ICML%202022/d736bb10d83a904aefc1d6ce93dc54b8.png?t=1657211059.6070156", "slides": "", "author_site": "Enrique Fita Sanmart\u00edn, Sebastian Damrich, Fred Hamprecht", "author": "Enrique Fita Sanmart\u0131\u0301n; Sebastian Damrich; Fred Hamprecht", "abstract": "Finding paths with optimal properties is a foundational problem in computer science. The notions of shortest paths (minimal sum of edge costs), minimax paths (minimal maximum edge weight), reliability of a path and many others all arise as special cases of the \"algebraic path problem\" (APP). Indeed, the APP formalizes the relation between different semirings such as min-plus, min-max and the distances they induce. We here clarify, for the first time, the relation between the potential distance and the log-semiring. We also define a new unifying family of algebraic structures that include all above-mentioned path problems as well as the commute cost and others as special or limiting cases. The family comprises not only semirings but also strong bimonoids (that is, semirings without distributivity). We call this new and very general distance the \"log-norm distance\". 
Finally, we derive some sufficient conditions which ensure that the APP associated with a semiring defines a metric over an arbitrary graph.", "bibtex": "@InProceedings{pmlr-v162-sanmarti-n22a,\n title = \t {The Algebraic Path Problem for Graph Metrics},\n author = {Sanmart\\'{\\i}n, Enrique Fita and Damrich, Sebastian and Hamprecht, Fred},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19178--19204},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sanmarti-n22a/sanmarti-n22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sanmarti-n22a.html},\n abstract = \t {Finding paths with optimal properties is a foundational problem in computer science. The notions of shortest paths (minimal sum of edge costs), minimax paths (minimal maximum edge weight), reliability of a path and many others all arise as special cases of the \"algebraic path problem\" (APP). Indeed, the APP formalizes the relation between different semirings such as min-plus, min-max and the distances they induce. We here clarify, for the first time, the relation between the potential distance and the log-semiring. We also define a new unifying family of algebraic structures that include all above-mentioned path problems as well as the commute cost and others as special or limiting cases. The family comprises not only semirings but also strong bimonoids (that is, semirings without distributivity). We call this new and very general distance the \"log-norm distance\". 
Finally, we derive some sufficient conditions which ensure that the APP associated with a semiring defines a metric over an arbitrary graph.}\n}", "pdf": "https://proceedings.mlr.press/v162/sanmarti-n22a/sanmarti-n22a.pdf", "supp": "", "pdf_size": 3151698, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13645814866081987896&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "IWR at Heidelberg University; IWR at Heidelberg University; IWR at Heidelberg University", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/sanmarti-n22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Heidelberg University", "aff_unique_dep": "Interdisciplinary Center for Scientific Computing (IWR)", "aff_unique_url": "https://www.iwr.uni-heidelberg.de/", "aff_unique_abbr": "IWR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "The CLRS Algorithmic Reasoning Benchmark", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16547", "id": "16547", "proceeding": "https://proceedings.mlr.press/v162/velickovic22a.html", "poster": "/media/PosterPDFs/ICML%202022/f8580959e35cb0934479bb007fb241c2.png?t=1657203105.4132416", "slides": "", "author_site": "Petar Veli\u010dkovi\u0107, Adri\u00e0 Puigdomenech Badia, David Budden, Razvan Pascanu, Andrea Banino, Misha Dashevskiy, Raia Hadsell, Charles Blundell", "author": "Petar Veli\u010dkovi\u0107; Adri\u00e0 Puigdom\u00e8nech Badia; David Budden; Razvan Pascanu; Andrea Banino; Misha Dashevskiy; Raia Hadsell; Charles Blundell", "abstract": "Learning representations of algorithms is an emerging area of machine learning, seeking to bridge concepts from neural networks with classical algorithms. Several important works have investigated whether neural networks can effectively reason like algorithms, typically by learning to execute them. The common trend in the area, however, is to generate targeted kinds of algorithmic data to evaluate specific hypotheses, making results hard to transfer across publications, and increasing the barrier of entry. To consolidate progress and work towards unified evaluation, we propose the CLRS Algorithmic Reasoning Benchmark, covering classical algorithms from the Introduction to Algorithms textbook. Our benchmark spans a variety of algorithmic reasoning procedures, including sorting, searching, dynamic programming, graph algorithms, string algorithms and geometric algorithms. We perform extensive experiments to demonstrate how several popular algorithmic reasoning baselines perform on these tasks, and consequently, highlight links to several open challenges. 
Our library is readily available at https://github.com/deepmind/clrs.", "bibtex": "@InProceedings{pmlr-v162-velickovic22a,\n title = \t {The {CLRS} Algorithmic Reasoning Benchmark},\n author = {Veli{\\v{c}}kovi{\\'c}, Petar and Badia, Adri{\\`a} Puigdom{\\`e}nech and Budden, David and Pascanu, Razvan and Banino, Andrea and Dashevskiy, Misha and Hadsell, Raia and Blundell, Charles},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22084--22102},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/velickovic22a/velickovic22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/velickovic22a.html},\n abstract = \t {Learning representations of algorithms is an emerging area of machine learning, seeking to bridge concepts from neural networks with classical algorithms. Several important works have investigated whether neural networks can effectively reason like algorithms, typically by learning to execute them. The common trend in the area, however, is to generate targeted kinds of algorithmic data to evaluate specific hypotheses, making results hard to transfer across publications, and increasing the barrier of entry. To consolidate progress and work towards unified evaluation, we propose the CLRS Algorithmic Reasoning Benchmark, covering classical algorithms from the Introduction to Algorithms textbook. Our benchmark spans a variety of algorithmic reasoning procedures, including sorting, searching, dynamic programming, graph algorithms, string algorithms and geometric algorithms. We perform extensive experiments to demonstrate how several popular algorithmic reasoning baselines perform on these tasks, and consequently, highlight links to several open challenges. 
Our library is readily available at https://github.com/deepmind/clrs.}\n}", "pdf": "https://proceedings.mlr.press/v162/velickovic22a/velickovic22a.pdf", "supp": "", "pdf_size": 2985175, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9181302241653376962&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "deepmind.com; ; ; ; ; ; ; ", "email": "deepmind.com; ; ; ; ; ; ; ", "github": "https://github.com/deepmind/clrs", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/velickovic22a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "The Combinatorial Brain Surgeon: Pruning Weights That Cancel One Another in Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18321", "id": "18321", "proceeding": "https://proceedings.mlr.press/v162/yu22f.html", "poster": "/media/PosterPDFs/ICML%202022/f72f78a365657d56853b6867fb37dc3c_Mz1bpKD.png?t=1658118805.8169649", "slides": "/media/icml-2022/Slides/18321_ZzGozOj.pdf", "author_site": "Xin Yu, Thiago Serra, Srikumar Ramalingam, Shandian Zhe", "author": "Xin Yu; Thiago Serra; Srikumar Ramalingam; Shandian Zhe", "abstract": "Neural networks tend to achieve better accuracy with training if they are larger {\u2014} even if the resulting models are overparameterized. Nevertheless, carefully removing such excess of parameters before, during, or after training may also produce models with similar or even improved accuracy. In many cases, that can be curiously achieved by heuristics as simple as removing a percentage of the weights with the smallest absolute value {\u2014} even though absolute value is not a perfect proxy for weight relevance. With the premise that obtaining significantly better performance from pruning depends on accounting for the combined effect of removing multiple weights, we revisit one of the classic approaches for impact-based pruning: the Optimal Brain Surgeon (OBS). We propose a tractable heuristic for solving the combinatorial extension of OBS, in which we select weights for simultaneous removal, and we combine it with a single-pass systematic update of unpruned weights. 
Our selection method outperforms other methods for high sparsity, and the single-pass weight update is also advantageous if applied after those methods.", "bibtex": "@InProceedings{pmlr-v162-yu22f,\n title = \t {The Combinatorial Brain Surgeon: Pruning Weights That Cancel One Another in Neural Networks},\n author = {Yu, Xin and Serra, Thiago and Ramalingam, Srikumar and Zhe, Shandian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25668--25683},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22f/yu22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22f.html},\n abstract = \t {Neural networks tend to achieve better accuracy with training if they are larger {\u2014} even if the resulting models are overparameterized. Nevertheless, carefully removing such excess of parameters before, during, or after training may also produce models with similar or even improved accuracy. In many cases, that can be curiously achieved by heuristics as simple as removing a percentage of the weights with the smallest absolute value {\u2014} even though absolute value is not a perfect proxy for weight relevance. With the premise that obtaining significantly better performance from pruning depends on accounting for the combined effect of removing multiple weights, we revisit one of the classic approaches for impact-based pruning: the Optimal Brain Surgeon (OBS). We propose a tractable heuristic for solving the combinatorial extension of OBS, in which we select weights for simultaneous removal, and we combine it with a single-pass systematic update of unpruned weights. 
Our selection method outperforms other methods for high sparsity, and the single-pass weight update is also advantageous if applied after those methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22f/yu22f.pdf", "supp": "", "pdf_size": 909817, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2256443788852509146&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "University of Utah, Salt Lake City, UT, United States+Google Research, New York, NY, United States; Bucknell University, Lewisburg, PA, United States+Google Research, New York, NY, United States; Google Research, New York, NY, United States; University of Utah, Salt Lake City, UT, United States", "aff_domain": "cs.utah.edu;bucknell.edu;google.com;cs.utah.edu", "email": "cs.utah.edu;bucknell.edu;google.com;cs.utah.edu", "github": "github.com/yuxwind/CBS", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/yu22f.html", "aff_unique_index": "0+1;2+1;1;0", "aff_unique_norm": "University of Utah;Google;Bucknell University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.utah.edu;https://research.google;https://www.bucknell.edu", "aff_unique_abbr": "U of U;Google Research;Bucknell", "aff_campus_unique_index": "0+1;2+1;1;0", "aff_campus_unique": "Salt Lake City;New York;Lewisburg", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "The Complexity of k-Means Clustering when Little is Known", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17703", "id": "17703", "proceeding": "https://proceedings.mlr.press/v162/ganian22a.html", "poster": "/media/PosterPDFs/ICML%202022/7ffd85d93a3e4de5c490d304ccd9f864.png?t=1658161668.491715", "slides": "", "author_site": "Robert Ganian, Thekla Hamm, Viktoriia Korchemna, Karolina Okrasa, Kirill Simonov", "author": "Robert Ganian; Thekla Hamm; Viktoriia Korchemna; Karolina Okrasa; Kirill Simonov", "abstract": "In the area of data analysis and arguably even in machine learning as a whole, few approaches have been as impactful as the classical k-means clustering. Here, we study the complexity of k-means clustering in settings where most of the data is not known or simply irrelevant. To obtain a more fine-grained understanding of the tractability of this clustering problem, we apply the parameterized complexity paradigm and obtain three new algorithms for k-means clustering of incomplete data: one for the clustering of bounded-domain (i.e., integer) data, and two incomparable algorithms that target real-valued data. 
Our approach is based on exploiting structural properties of a graphical encoding of the missing entries, and we show that tractability can be achieved using significantly less restrictive parameterizations than in the complementary case of few missing entries.", "bibtex": "@InProceedings{pmlr-v162-ganian22a,\n title = \t {The Complexity of k-Means Clustering when Little is Known},\n author = {Ganian, Robert and Hamm, Thekla and Korchemna, Viktoriia and Okrasa, Karolina and Simonov, Kirill},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6960--6987},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ganian22a/ganian22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ganian22a.html},\n abstract = \t {In the area of data analysis and arguably even in machine learning as a whole, few approaches have been as impactful as the classical k-means clustering. Here, we study the complexity of k-means clustering in settings where most of the data is not known or simply irrelevant. To obtain a more fine-grained understanding of the tractability of this clustering problem, we apply the parameterized complexity paradigm and obtain three new algorithms for k-means clustering of incomplete data: one for the clustering of bounded-domain (i.e., integer) data, and two incomparable algorithms that target real-valued data. Our approach is based on exploiting structural properties of a graphical encoding of the missing entries, and we show that tractability can be achieved using significantly less restrictive parameterizations than in the complementary case of few missing entries.}\n}", "pdf": "https://proceedings.mlr.press/v162/ganian22a/ganian22a.pdf", "supp": "", "pdf_size": 1043646, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15210947684519982656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Algorithms and Complexity Group, TU Wien, Austria; Algorithms and Complexity Group, TU Wien, Austria; Algorithms and Complexity Group, TU Wien, Austria; Institute of Informatics, University of Warsaw, Poland; Algorithms and Complexity Group, TU Wien, Austria", "aff_domain": "gmail.com;ac.tuwien.ac.at;ac.tuwien.ac.at;mini.pw.edu.pl;ac.tuwien.ac.at", "email": "gmail.com;ac.tuwien.ac.at;ac.tuwien.ac.at;mini.pw.edu.pl;ac.tuwien.ac.at", "github": "", "project": "https://www.kaggle.com/netflix-inc/netflix-prize-data", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ganian22a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "TU Wien;University of Warsaw", "aff_unique_dep": "Algorithms and Complexity Group;Institute of Informatics", "aff_unique_url": "https://www.tuwien.ac.at;https://www.uw.edu.pl", "aff_unique_abbr": "TU Wien;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Austria;Poland" }, { "title": "The Dual Form of Neural Networks Revisited: Connecting Test Time Predictions to Training Patterns via Spotlights of Attention", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17227", "id": "17227", "proceeding": "https://proceedings.mlr.press/v162/irie22a.html", "poster": 
"/media/PosterPDFs/ICML%202022/c94a589bdd47870b1d74b258d1ce3b33.png?t=1657790234.5585814", "slides": "", "author_site": "Kazuki Irie, Robert Cordas, J\u00fcrgen Schmidhuber", "author": "Kazuki Irie; R\u00f3bert Csord\u00e1s; J\u00fcrgen Schmidhuber", "abstract": "Linear layers in neural networks (NNs) trained by gradient descent can be expressed as a key-value memory system which stores all training datapoints and the initial weights, and produces outputs using unnormalised dot attention over the entire training experience. While this has been technically known since the 1960s, no prior work has effectively studied the operations of NNs in such a form, presumably due to prohibitive time and space complexities and impractical model sizes, all of them growing linearly with the number of training patterns which may get very large. However, this dual formulation offers a possibility of directly visualising how an NN makes use of training patterns at test time, by examining the corresponding attention weights. We conduct experiments on small scale supervised image classification tasks in single-task, multi-task, and continual learning settings, as well as language modelling, and discuss potentials and limits of this view for better understanding and interpreting how NNs exploit training patterns. Our code is public.", "bibtex": "@InProceedings{pmlr-v162-irie22a,\n title = \t {The Dual Form of Neural Networks Revisited: Connecting Test Time Predictions to Training Patterns via Spotlights of Attention},\n author = {Irie, Kazuki and Csord{\\'a}s, R{\\'o}bert and Schmidhuber, J{\\\"u}rgen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9639--9659},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/irie22a/irie22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/irie22a.html},\n abstract = \t {Linear layers in neural networks (NNs) trained by gradient descent can be expressed as a key-value memory system which stores all training datapoints and the initial weights, and produces outputs using unnormalised dot attention over the entire training experience. While this has been technically known since the 1960s, no prior work has effectively studied the operations of NNs in such a form, presumably due to prohibitive time and space complexities and impractical model sizes, all of them growing linearly with the number of training patterns which may get very large. However, this dual formulation offers a possibility of directly visualising how an NN makes use of training patterns at test time, by examining the corresponding attention weights. We conduct experiments on small scale supervised image classification tasks in single-task, multi-task, and continual learning settings, as well as language modelling, and discuss potentials and limits of this view for better understanding and interpreting how NNs exploit training patterns. 
Our code is public.}\n}", "pdf": "https://proceedings.mlr.press/v162/irie22a/irie22a.pdf", "supp": "", "pdf_size": 1915662, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11337857580515349157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia; The Swiss AI Lab, IDSIA, USI & SUPSI, Lugano, Switzerland+AI Initiative, King Abdullah University of Science and Technology (KAUST), Thuwal, Saudi Arabia", "aff_domain": "idsia.ch;idsia.ch;idsia.ch", "email": "idsia.ch;idsia.ch;idsia.ch", "github": "https://github.com/robertcsordas/linear_layer_as_attention", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/irie22a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Swiss AI Lab IDSIA;King Abdullah University of Science and Technology", "aff_unique_dep": "AI Lab;AI Initiative", "aff_unique_url": "https://www.idsia.ch/;https://www.kaust.edu.sa", "aff_unique_abbr": "IDSIA;KAUST", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Lugano;Thuwal", "aff_country_unique_index": "0+1;0+1;0+1", "aff_country_unique": "Switzerland;Saudi Arabia" }, { "title": "The Fundamental Price of Secure Aggregation in Differentially Private Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17529", "id": "17529", "proceeding": "https://proceedings.mlr.press/v162/chen22c.html", "poster": "/media/PosterPDFs/ICML%202022/4a64d913220fca4c33c140c6952688a8.png?t=1657415826.892301", "slides": "/media/icml-2022/Slides/17529_Np24Tz1.pdf", "author_site": "Wei-Ning Chen, Christopher Choquette Choo, Peter Kairouz, Ananda Suresh", "author": "Wei-Ning Chen; Christopher A Choquette Choo; Peter Kairouz; Ananda Theertha Suresh", "abstract": "We consider the problem of training a $d$ dimensional model with distributed differential privacy (DP) where secure aggregation (SecAgg) is used to ensure that the server only sees the noisy sum of $n$ model updates in every training round. Taking into account the constraints imposed by SecAgg, we characterize the fundamental communication cost required to obtain the best accuracy achievable under $\\varepsilon$ central DP (i.e. under a fully trusted server and no communication constraints). Our results show that $\\tilde{O}(\\min(n^2\\varepsilon^2, d))$ bits per client are both sufficient and necessary, and this fundamental limit can be achieved by a linear scheme based on sparse random projections. This provides a significant improvement relative to state-of-the-art SecAgg distributed DP schemes which use $\\tilde{O}(d\\log(d/\\varepsilon^2))$ bits per client. Empirically, we evaluate our proposed scheme on real-world federated learning tasks. We find that our theoretical analysis is well matched in practice. In particular, we show that we can reduce the communication cost to under $1.78$ bits per parameter in realistic privacy settings without decreasing test-time performance. 
Our work hence theoretically and empirically specifies the fundamental price of using SecAgg.", "bibtex": "@InProceedings{pmlr-v162-chen22c,\n title = \t {The Fundamental Price of Secure Aggregation in Differentially Private Federated Learning},\n author = {Chen, Wei-Ning and Choo, Christopher A Choquette and Kairouz, Peter and Suresh, Ananda Theertha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3056--3089},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22c/chen22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22c.html},\n abstract = \t {We consider the problem of training a $d$ dimensional model with distributed differential privacy (DP) where secure aggregation (SecAgg) is used to ensure that the server only sees the noisy sum of $n$ model updates in every training round. Taking into account the constraints imposed by SecAgg, we characterize the fundamental communication cost required to obtain the best accuracy achievable under $\\varepsilon$ central DP (i.e. under a fully trusted server and no communication constraints). Our results show that $\\tilde{O}(\\min(n^2\\varepsilon^2, d))$ bits per client are both sufficient and necessary, and this fundamental limit can be achieved by a linear scheme based on sparse random projections. This provides a significant improvement relative to state-of-the-art SecAgg distributed DP schemes which use $\\tilde{O}(d\\log(d/\\varepsilon^2))$ bits per client. Empirically, we evaluate our proposed scheme on real-world federated learning tasks. We find that our theoretical analysis is well matched in practice. In particular, we show that we can reduce the communication cost to under $1.78$ bits per parameter in realistic privacy settings without decreasing test-time performance. 
Our work hence theoretically and empirically specifies the fundamental price of using SecAgg.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22c/chen22c.pdf", "supp": "", "pdf_size": 1083376, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2494404504227550780&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Stanford University + Google Research; Google Research; Google Research; Google Research", "aff_domain": "stanford.edu;google.com; ; ", "email": "stanford.edu;google.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/chen22c.html", "aff_unique_index": "0+1;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.stanford.edu;https://research.google", "aff_unique_abbr": "Stanford;Google Research", "aff_campus_unique_index": "0+1;1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Geometry of Robust Value Functions", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17961", "id": "17961", "proceeding": "https://proceedings.mlr.press/v162/wang22k.html", "poster": "", "slides": "", "author_site": "Kaixin Wang, Navdeep Kumar, Kuangqi Zhou, Bryan Hooi, Jiashi Feng, Shie Mannor", "author": "Kaixin Wang; Navdeep Kumar; Kuangqi Zhou; Bryan Hooi; Jiashi Feng; Shie Mannor", "abstract": "The space of value functions is a fundamental concept in reinforcement learning. Characterizing its geometric properties may provide insights for optimization and representation. Existing works mainly focus on the value space for Markov Decision Processes (MDPs). In this paper, we study the geometry of the robust value space for the more general Robust MDPs (RMDPs) setting, where transition uncertainties are considered. Specifically, since we find it hard to directly adapt prior approaches to RMDPs, we start with revisiting the non-robust case, and introduce a new perspective that enables us to characterize both the non-robust and robust value space in a similar fashion. The key of this perspective is to decompose the value space, in a state-wise manner, into unions of hypersurfaces. Through our analysis, we show that the robust value space is determined by a set of conic hypersurfaces, each of which contains the robust values of all policies that agree on one state. Furthermore, we find that taking only extreme points in the uncertainty set is sufficient to determine the robust value space. 
Finally, we discuss some other aspects about the robust value space, including its non-convexity and policy agreement on multiple states.", "bibtex": "@InProceedings{pmlr-v162-wang22k,\n title = \t {The Geometry of Robust Value Functions},\n author = {Wang, Kaixin and Kumar, Navdeep and Zhou, Kuangqi and Hooi, Bryan and Feng, Jiashi and Mannor, Shie},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22727--22751},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22k/wang22k.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22k.html},\n abstract = \t {The space of value functions is a fundamental concept in reinforcement learning. Characterizing its geometric properties may provide insights for optimization and representation. Existing works mainly focus on the value space for Markov Decision Processes (MDPs). In this paper, we study the geometry of the robust value space for the more general Robust MDPs (RMDPs) setting, where transition uncertainties are considered. Specifically, since we find it hard to directly adapt prior approaches to RMDPs, we start with revisiting the non-robust case, and introduce a new perspective that enables us to characterize both the non-robust and robust value space in a similar fashion. The key of this perspective is to decompose the value space, in a state-wise manner, into unions of hypersurfaces. Through our analysis, we show that the robust value space is determined by a set of conic hypersurfaces, each of which contains the robust values of all policies that agree on one state. Furthermore, we find that taking only extreme points in the uncertainty set is sufficient to determine the robust value space. Finally, we discuss some other aspects about the robust value space, including its non-convexity and policy agreement on multiple states.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22k/wang22k.pdf", "supp": "", "pdf_size": 3957196, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6075972255127648122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wang22k.html" }, { "title": "The Importance of Non-Markovianity in Maximum State Entropy Exploration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16289", "id": "16289", "proceeding": "https://proceedings.mlr.press/v162/mutti22a.html", "poster": "/media/PosterPDFs/ICML%202022/d3630410c51e60941a9001a46871070e.png?t=1657525866.9892178", "slides": "", "author_site": "Mirco Mutti, Riccardo De Santi, Marcello Restelli", "author": "Mirco Mutti; Riccardo De Santi; Marcello Restelli", "abstract": "In the maximum state entropy exploration framework, an agent interacts with a reward-free environment to learn a policy that maximizes the entropy of the expected state visitations it is inducing. Hazan et al. (2019) noted that the class of Markovian stochastic policies is sufficient for the maximum state entropy objective, and exploiting non-Markovianity is generally considered pointless in this setting. 
In this paper, we argue that non-Markovianity is instead paramount for maximum state entropy exploration in a finite-sample regime. Especially, we recast the objective to target the expected entropy of the induced state visitations in a single trial. Then, we show that the class of non-Markovian deterministic policies is sufficient for the introduced objective, while Markovian policies suffer non-zero regret in general. However, we prove that the problem of finding an optimal non-Markovian policy is NP-hard. Despite this negative result, we discuss avenues to address the problem in a tractable way and how non-Markovian exploration could benefit the sample efficiency of online reinforcement learning in future works.", "bibtex": "@InProceedings{pmlr-v162-mutti22a,\n title = \t {The Importance of Non-Markovianity in Maximum State Entropy Exploration},\n author = {Mutti, Mirco and De Santi, Riccardo and Restelli, Marcello},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16223--16239},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mutti22a/mutti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mutti22a.html},\n abstract = \t {In the maximum state entropy exploration framework, an agent interacts with a reward-free environment to learn a policy that maximizes the entropy of the expected state visitations it is inducing. Hazan et al. (2019) noted that the class of Markovian stochastic policies is sufficient for the maximum state entropy objective, and exploiting non-Markovianity is generally considered pointless in this setting. In this paper, we argue that non-Markovianity is instead paramount for maximum state entropy exploration in a finite-sample regime. Especially, we recast the objective to target the expected entropy of the induced state visitations in a single trial. Then, we show that the class of non-Markovian deterministic policies is sufficient for the introduced objective, while Markovian policies suffer non-zero regret in general. However, we prove that the problem of finding an optimal non-Markovian policy is NP-hard. 
Despite this negative result, we discuss avenues to address the problem in a tractable way and how non-Markovian exploration could benefit the sample efficiency of online reinforcement learning in future works.}\n}", "pdf": "https://proceedings.mlr.press/v162/mutti22a/mutti22a.pdf", "supp": "", "pdf_size": 513525, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2291117086854645081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Politecnico di Milano + Universit\u00e0 di Bologna; ETH Zurich; Politecnico di Milano", "aff_domain": "polimi.it;ethz.ch; ", "email": "polimi.it;ethz.ch; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mutti22a.html", "aff_unique_index": "0+1;2;0", "aff_unique_norm": "Politecnico di Milano;University of Bologna;ETH Zurich", "aff_unique_dep": ";;", "aff_unique_url": "https://www.polimi.it;https://www.unibo.it;https://www.ethz.ch", "aff_unique_abbr": "Polimi;Unibo;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "Italy;Switzerland" }, { "title": "The Infinite Contextual Graph Markov Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16215", "id": "16215", "proceeding": "https://proceedings.mlr.press/v162/castellana22a.html", "poster": "/media/PosterPDFs/ICML%202022/b069b3415151fa7217e870017374de7c.png?t=1657529000.5807805", "slides": "", "author_site": "Daniele Castellana, Federico Errica, Davide Bacciu, Alessio Micheli", "author": "Daniele Castellana; Federico Errica; Davide Bacciu; Alessio Micheli", "abstract": "The Contextual Graph Markov Model (CGMM) is a deep, unsupervised, and probabilistic model for graphs that is trained incrementally on a layer-by-layer basis. As with most Deep Graph Networks, an inherent limitation is the need to perform an extensive model selection to choose the proper size of each layer\u2019s latent representation. In this paper, we address this problem by introducing the Infinite Contextual Graph Markov Model (iCGMM), the first deep Bayesian nonparametric model for graph learning. During training, iCGMM can adapt the complexity of each layer to better fit the underlying data distribution. On 8 graph classification tasks, we show that iCGMM: i) successfully recovers or improves CGMM\u2019s performances while reducing the hyper-parameters\u2019 search space; ii) performs comparably to most end-to-end supervised methods. The results include studies on the importance of depth, hyper-parameters, and compression of the graph embeddings. 
We also introduce a novel approximated inference procedure that better deals with larger graph topologies.", "bibtex": "@InProceedings{pmlr-v162-castellana22a,\n title = \t {The Infinite Contextual Graph {M}arkov Model},\n author = {Castellana, Daniele and Errica, Federico and Bacciu, Davide and Micheli, Alessio},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2721--2737},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/castellana22a/castellana22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/castellana22a.html},\n abstract = \t {The Contextual Graph Markov Model (CGMM) is a deep, unsupervised, and probabilistic model for graphs that is trained incrementally on a layer-by-layer basis. As with most Deep Graph Networks, an inherent limitation is the need to perform an extensive model selection to choose the proper size of each layer\u2019s latent representation. In this paper, we address this problem by introducing the Infinite Contextual Graph Markov Model (iCGMM), the first deep Bayesian nonparametric model for graph learning. During training, iCGMM can adapt the complexity of each layer to better fit the underlying data distribution. On 8 graph classification tasks, we show that iCGMM: i) successfully recovers or improves CGMM\u2019s performances while reducing the hyper-parameters\u2019 search space; ii) performs comparably to most end-to-end supervised methods. The results include studies on the importance of depth, hyper-parameters, and compression of the graph embeddings. 
We also introduce a novel approximated inference procedure that better deals with larger graph topologies.}\n}", "pdf": "https://proceedings.mlr.press/v162/castellana22a/castellana22a.pdf", "supp": "", "pdf_size": 602558, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8605870071068450875&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of Pisa, Italy+NEC Laboratories Europe, Heidelberg, Germany; Department of Computer Science, University of Pisa, Italy+NEC Laboratories Europe, Heidelberg, Germany; Department of Computer Science, University of Pisa, Italy; Department of Computer Science, University of Pisa, Italy", "aff_domain": "di.unipi.it;neclab.eu; ; ", "email": "di.unipi.it;neclab.eu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/castellana22a.html", "aff_unique_index": "0+1;0+1;0;0", "aff_unique_norm": "University of Pisa;NEC Laboratories Europe", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.unipi.it;https://www.nec-labs.eu", "aff_unique_abbr": "UNIP;NEC Europe", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Heidelberg", "aff_country_unique_index": "0+1;0+1;0;0", "aff_country_unique": "Italy;Germany" }, { "title": "The Multivariate Community Hawkes Model for Dependent Relational Events in Continuous-time Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17097", "id": "17097", "proceeding": "https://proceedings.mlr.press/v162/soliman22a.html", "poster": "/media/PosterPDFs/ICML%202022/8ad56037830fcf5c6396aa69b1c252d0.png?t=1657645803.7041855", "slides": "", "author_site": "Hadeel Soliman, Lingfei Zhao, Zhipeng Huang, Subhadeep Paul, Kevin Xu", "author": "Hadeel Soliman; Lingfei Zhao; Zhipeng Huang; Subhadeep Paul; Kevin S Xu", "abstract": "The stochastic block model (SBM) is one of the most widely used generative models for network data. Many continuous-time dynamic network models are built upon the same assumption as the SBM: edges or events between all pairs of nodes are conditionally independent given the block or community memberships, which prevents them from reproducing higher-order motifs such as triangles that are commonly observed in real networks. We propose the multivariate community Hawkes (MULCH) model, an extremely flexible community-based model for continuous-time networks that introduces dependence between node pairs using structured multivariate Hawkes processes. We fit the model using a spectral clustering and likelihood-based local refinement procedure. 
We find that our proposed MULCH model is far more accurate than existing models both for predictive and generative tasks.", "bibtex": "@InProceedings{pmlr-v162-soliman22a,\n title = \t {The Multivariate Community {H}awkes Model for Dependent Relational Events in Continuous-time Networks},\n author = {Soliman, Hadeel and Zhao, Lingfei and Huang, Zhipeng and Paul, Subhadeep and Xu, Kevin S},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20329--20346},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/soliman22a/soliman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/soliman22a.html},\n abstract = \t {The stochastic block model (SBM) is one of the most widely used generative models for network data. Many continuous-time dynamic network models are built upon the same assumption as the SBM: edges or events between all pairs of nodes are conditionally independent given the block or community memberships, which prevents them from reproducing higher-order motifs such as triangles that are commonly observed in real networks. We propose the multivariate community Hawkes (MULCH) model, an extremely flexible community-based model for continuous-time networks that introduces dependence between node pairs using structured multivariate Hawkes processes. We fit the model using a spectral clustering and likelihood-based local refinement procedure. We find that our proposed MULCH model is far more accurate than existing models both for predictive and generative tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/soliman22a/soliman22a.pdf", "supp": "", "pdf_size": 789656, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16117758994538292993&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Science, University of Toledo, Toledo, OH, USA+Department of Statistics, The Ohio State University, Columbus, OH, USA; Department of Statistics, The Ohio State University, Columbus, OH, USA; Department of Electrical Engineering and Computer Science, University of Toledo, Toledo, OH, USA+Department of Statistics, The Ohio State University, Columbus, OH, USA; Department of Statistics, The Ohio State University, Columbus, OH, USA; Department of Electrical Engineering and Computer Science, University of Toledo, Toledo, OH, USA", "aff_domain": "utoledo.edu;osu.edu;utoledo.edu;osu.edu;utoledo.edu", "email": "utoledo.edu;osu.edu;utoledo.edu;osu.edu;utoledo.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/soliman22a.html", "aff_unique_index": "0+1;1;0+1;1;0", "aff_unique_norm": "University of Toledo;Ohio State University", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;Department of Statistics", "aff_unique_url": "https://www.utoledo.edu;https://www.osu.edu", "aff_unique_abbr": "UT;OSU", "aff_campus_unique_index": "0+1;1;0+1;1;0", "aff_campus_unique": "Toledo;Columbus", "aff_country_unique_index": "0+0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "The Neural Race Reduction: Dynamics of Abstraction in Gated Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17041", 
"id": "17041", "proceeding": "https://proceedings.mlr.press/v162/saxe22a.html", "poster": "/media/PosterPDFs/ICML%202022/624ec1c881656ee6418604df2928494b.png?t=1658345656.8977296", "slides": "", "author_site": "Andrew Saxe, Shagun Sodhani, Sam Lewallen", "author": "Andrew Saxe; Shagun Sodhani; Sam Jay Lewallen", "abstract": "Our theoretical understanding of deep learning has not kept pace with its empirical success. While network architecture is known to be critical, we do not yet understand its effect on learned representations and network behavior, or how this architecture should reflect task structure.In this work, we begin to address this gap by introducing the Gated Deep Linear Network framework that schematizes how pathways of information flow impact learning dynamics within an architecture. Crucially, because of the gating, these networks can compute nonlinear functions of their input. We derive an exact reduction and, for certain cases, exact solutions to the dynamics of learning. Our analysis demonstrates that the learning dynamics in structured networks can be conceptualized as a neural race with an implicit bias towards shared representations, which then govern the model\u2019s ability to systematically generalize, multi-task, and transfer. We validate our key insights on naturalistic datasets and with relaxed assumptions. Taken together, our work gives rise to general hypotheses relating neural architecture to learning and provides a mathematical approach towards understanding the design of more complex architectures and the role of modularity and compositionality in solving real-world problems. The code and results are available at https://www.saxelab.org/gated-dln.", "bibtex": "@InProceedings{pmlr-v162-saxe22a,\n title = \t {The Neural Race Reduction: Dynamics of Abstraction in Gated Networks},\n author = {Saxe, Andrew and Sodhani, Shagun and Lewallen, Sam Jay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19287--19309},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saxe22a/saxe22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/saxe22a.html},\n abstract = \t {Our theoretical understanding of deep learning has not kept pace with its empirical success. While network architecture is known to be critical, we do not yet understand its effect on learned representations and network behavior, or how this architecture should reflect task structure.In this work, we begin to address this gap by introducing the Gated Deep Linear Network framework that schematizes how pathways of information flow impact learning dynamics within an architecture. Crucially, because of the gating, these networks can compute nonlinear functions of their input. We derive an exact reduction and, for certain cases, exact solutions to the dynamics of learning. Our analysis demonstrates that the learning dynamics in structured networks can be conceptualized as a neural race with an implicit bias towards shared representations, which then govern the model\u2019s ability to systematically generalize, multi-task, and transfer. We validate our key insights on naturalistic datasets and with relaxed assumptions. 
Taken together, our work gives rise to general hypotheses relating neural architecture to learning and provides a mathematical approach towards understanding the design of more complex architectures and the role of modularity and compositionality in solving real-world problems. The code and results are available at https://www.saxelab.org/gated-dln.}\n}", "pdf": "https://proceedings.mlr.press/v162/saxe22a/saxe22a.pdf", "supp": "", "pdf_size": 3285732, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14334536452871565583&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Gatsby Computational Neuroscience Unit & Sainsbury Wellcome Centre, UCL+FAIR, Meta AI+CIFAR Azrieli Global Scholar, CIFAR; FAIR, Meta AI; Gatsby Computational Neuroscience Unit & Sainsbury Wellcome Centre, UCL", "aff_domain": "ucl.ac.uk; ; ", "email": "ucl.ac.uk; ; ", "github": "", "project": "https://www.saxelab.org/gated-dln", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/saxe22a.html", "aff_unique_index": "0+1+2;1;0", "aff_unique_norm": "University College London;Meta;CIFAR", "aff_unique_dep": "Gatsby Computational Neuroscience Unit & Sainsbury Wellcome Centre;Meta AI;CIFAR Azrieli Global Scholar", "aff_unique_url": "https://www.ucl.ac.uk;https://meta.ai;https://www.cifar.ca", "aff_unique_abbr": "UCL;Meta AI;CIFAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;1;0", "aff_country_unique": "United Kingdom;United States;Canada" }, { "title": "The Poisson Binomial Mechanism for Unbiased Federated Learning with Secure Aggregation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17921", "id": "17921", "proceeding": "https://proceedings.mlr.press/v162/chen22s.html", "poster": "/media/PosterPDFs/ICML%202022/47951a40efc0d2f7da8ff1ecbfde80f4.png?t=1657647070.304331", "slides": "", "author_site": "Wei-Ning Chen, Ayfer Ozgur, Peter Kairouz", "author": "Wei-Ning Chen; Ayfer Ozgur; Peter Kairouz", "abstract": "We introduce the Poisson Binomial mechanism (PBM), a discrete differential privacy mechanism for distributed mean estimation (DME) with applications to federated learning and analytics. We provide a tight analysis of its privacy guarantees, showing that it achieves the same privacy-accuracy trade-offs as the continuous Gaussian mechanism. Our analysis is based on a novel bound on the R\u00e9nyi divergence of two Poisson binomial distributions that may be of independent interest. Unlike previous discrete DP schemes based on additive noise, our mechanism encodes local information into a parameter of the binomial distribution, and hence the output distribution is discrete with bounded support. Moreover, the support does not increase as the privacy budget goes to zero as in the case of additive schemes which require the addition of more noise to achieve higher privacy; on the contrary, the support becomes smaller as eps goes to zero. The bounded support enables us to combine our mechanism with secure aggregation (SecAgg), a multi-party cryptographic protocol, without the need of performing modular clipping which results in an unbiased estimator of the sum of the local vectors. This in turn allows us to apply it in the private FL setting and provide an upper bound on the convergence rate of the SGD algorithm. 
Moreover, since the support of the output distribution becomes smaller as $\\varepsilon \\ra 0$, the communication cost of our scheme decreases with the privacy constraint $\\varepsilon$, outperforming all previous distributed DP schemes based on additive noise in the high privacy or low communication regimes.", "bibtex": "@InProceedings{pmlr-v162-chen22s,\n title = \t {The Poisson Binomial Mechanism for Unbiased Federated Learning with Secure Aggregation},\n author = {Chen, Wei-Ning and Ozgur, Ayfer and Kairouz, Peter},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3490--3506},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22s/chen22s.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22s.html},\n abstract = \t {We introduce the Poisson Binomial mechanism (PBM), a discrete differential privacy mechanism for distributed mean estimation (DME) with applications to federated learning and analytics. We provide a tight analysis of its privacy guarantees, showing that it achieves the same privacy-accuracy trade-offs as the continuous Gaussian mechanism. Our analysis is based on a novel bound on the R\u00e9nyi divergence of two Poisson binomial distributions that may be of independent interest. Unlike previous discrete DP schemes based on additive noise, our mechanism encodes local information into a parameter of the binomial distribution, and hence the output distribution is discrete with bounded support. Moreover, the support does not increase as the privacy budget goes to zero as in the case of additive schemes which require the addition of more noise to achieve higher privacy; on the contrary, the support becomes smaller as eps goes to zero. The bounded support enables us to combine our mechanism with secure aggregation (SecAgg), a multi-party cryptographic protocol, without the need of performing modular clipping which results in an unbiased estimator of the sum of the local vectors. This in turn allows us to apply it in the private FL setting and provide an upper bound on the convergence rate of the SGD algorithm. 
Moreover, since the support of the output distribution becomes smaller as $\\varepsilon \\ra 0$, the communication cost of our scheme decreases with the privacy constraint $\\varepsilon$, outperforming all previous distributed DP schemes based on additive noise in the high privacy or low communication regimes.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22s/chen22s.pdf", "supp": "", "pdf_size": 1536098, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16823859671117204590&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; Google Research", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/chen22s.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Department of Electrical Engineering;Google Research", "aff_unique_url": "https://www.stanford.edu;https://research.google", "aff_unique_abbr": "Stanford;Google Research", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Power of Exploiter: Provable Multi-Agent RL in Large State Spaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16957", "id": "16957", "proceeding": "https://proceedings.mlr.press/v162/jin22c.html", "poster": "/media/PosterPDFs/ICML%202022/f3173935ed8ac4bf073c1bcd63171f8a.png?t=1656558641.5381875", "slides": "", "author_site": "Chi Jin, Qinghua Liu, Tiancheng Yu", "author": "Chi Jin; Qinghua Liu; Tiancheng Yu", "abstract": "Modern reinforcement learning (RL) commonly engages practical problems with large state spaces, where function approximation must be deployed to approximate either the value function or the policy. While recent progresses in RL theory address a rich set of RL problems with general function approximation, such successes are mostly restricted to the single-agent setting. It remains elusive how to extend these results to multi-agent RL, especially in the face of new game-theoretical challenges. This paper considers two-player zero-sum Markov Games (MGs). We propose a new algorithm that can provably find the Nash equilibrium policy using a polynomial number of samples, for any MG with low", "bibtex": "@InProceedings{pmlr-v162-jin22c,\n title = \t {The Power of Exploiter: Provable Multi-Agent {RL} in Large State Spaces},\n author = {Jin, Chi and Liu, Qinghua and Yu, Tiancheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10251--10279},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jin22c/jin22c.pdf},\n url = \t {https://proceedings.mlr.press/v162/jin22c.html},\n abstract = \t {Modern reinforcement learning (RL) commonly engages practical problems with large state spaces, where function approximation must be deployed to approximate either the value function or the policy. 
While recent progresses in RL theory address a rich set of RL problems with general function approximation, such successes are mostly restricted to the single-agent setting. It remains elusive how to extend these results to multi-agent RL, especially in the face of new game-theoretical challenges. This paper considers two-player zero-sum Markov Games (MGs). We propose a new algorithm that can provably find the Nash equilibrium policy using a polynomial number of samples, for any MG with low", "pdf": "https://proceedings.mlr.press/v162/jin22c/jin22c.pdf", "supp": "", "pdf_size": 416479, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7859714395115586271&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Princeton University; Princeton University; MIT", "aff_domain": "princeton.edu; ; ", "email": "princeton.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/jin22c.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Princeton University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://web.mit.edu", "aff_unique_abbr": "Princeton;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Primacy Bias in Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18407", "id": "18407", "proceeding": "https://proceedings.mlr.press/v162/nikishin22a.html", "poster": "/media/PosterPDFs/ICML%202022/007d4a1214289aea09b9759ae1324e96.png?t=1657669430.5789707", "slides": "", "author_site": "Evgenii Nikishin, Max Schwarzer, Pierluca D'Oro, Pierre-Luc Bacon, Aaron Courville", "author": "Evgenii Nikishin; Max Schwarzer; Pierluca D\u2019Oro; Pierre-Luc Bacon; Aaron Courville", "abstract": "This work identifies a common flaw of deep reinforcement learning (RL) algorithms: a tendency to rely on early interactions and ignore useful evidence encountered later. Because of training on progressively growing datasets, deep RL agents incur a risk of overfitting to earlier experiences, negatively affecting the rest of the learning process. Inspired by cognitive science, we refer to this effect as the primacy bias. Through a series of experiments, we dissect the algorithmic aspects of deep RL that exacerbate this bias. We then propose a simple yet generally-applicable mechanism that tackles the primacy bias by periodically resetting a part of the agent. 
We apply this mechanism to algorithms in both discrete (Atari 100k) and continuous action (DeepMind Control Suite) domains, consistently improving their performance.", "bibtex": "@InProceedings{pmlr-v162-nikishin22a,\n title = \t {The Primacy Bias in Deep Reinforcement Learning},\n author = {Nikishin, Evgenii and Schwarzer, Max and D'Oro, Pierluca and Bacon, Pierre-Luc and Courville, Aaron},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16828--16847},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nikishin22a/nikishin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nikishin22a.html},\n abstract = \t {This work identifies a common flaw of deep reinforcement learning (RL) algorithms: a tendency to rely on early interactions and ignore useful evidence encountered later. Because of training on progressively growing datasets, deep RL agents incur a risk of overfitting to earlier experiences, negatively affecting the rest of the learning process. Inspired by cognitive science, we refer to this effect as the primacy bias. Through a series of experiments, we dissect the algorithmic aspects of deep RL that exacerbate this bias. We then propose a simple yet generally-applicable mechanism that tackles the primacy bias by periodically resetting a part of the agent. We apply this mechanism to algorithms in both discrete (Atari 100k) and continuous action (DeepMind Control Suite) domains, consistently improving their performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/nikishin22a/nikishin22a.pdf", "supp": "", "pdf_size": 2900383, "gs_citation": 216, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11620338198970862085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Mila, Universit\u00e9 de Montr\u00e9al; Mila, Universit\u00e9 de Montr\u00e9al; Mila, Universit\u00e9 de Montr\u00e9al; Mila, Universit\u00e9 de Montr\u00e9al; Mila, Universit\u00e9 de Montr\u00e9al", "aff_domain": "mila.quebec; ; ; ; ", "email": "mila.quebec; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/nikishin22a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "Mila", "aff_unique_url": "https://www.mila.quebec", "aff_unique_abbr": "Mila", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "The Role of Deconfounding in Meta-learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17169", "id": "17169", "proceeding": "https://proceedings.mlr.press/v162/jiang22a.html", "poster": "/media/PosterPDFs/ICML%202022/19f01591b6ca3ba03f1aedc8db12cdb9.png?t=1658069299.4312537", "slides": "", "author_site": "Yinjie Jiang, Zhengyu Chen, Kun Kuang, Luotian Yuan, Xinhai Ye, Zhihua Wang, Fei Wu, Ying WEI", "author": "Yinjie Jiang; Zhengyu Chen; Kun Kuang; Luotian Yuan; Xinhai Ye; Zhihua Wang; Fei Wu; Ying Wei", "abstract": "Meta-learning has emerged as a potent paradigm for quick learning of few-shot tasks, by leveraging the meta-knowledge learned from meta-training tasks. 
Well-generalized meta-knowledge that facilitates fast adaptation in each task is preferred; however, recent evidence suggests the undesirable memorization effect where the meta-knowledge simply memorizing all meta-training tasks discourages task-specific adaptation and poorly generalizes. There have been several solutions to mitigating the effect, including both regularizer-based and augmentation-based methods, while a systematic understanding of these methods in a single framework is still lacking. In this paper, we offer a novel causal perspective of meta-learning. Through the lens of causality, we conclude the universal label space as a confounder to be the causing factor of memorization and frame the two lines of prevailing methods as different deconfounder approaches. Remarkably, derived from the causal inference principle of front-door adjustment, we propose two frustratingly easy but effective deconfounder algorithms, i.e., sampling multiple versions of the meta-knowledge via Dropout and grouping the meta-knowledge into multiple bins. The proposed causal perspective not only brings in the two deconfounder algorithms that surpass previous works in four benchmark datasets towards combating memorization, but also opens a promising direction for meta-learning.", "bibtex": "@InProceedings{pmlr-v162-jiang22a,\n title = \t {The Role of Deconfounding in Meta-learning},\n author = {Jiang, Yinjie and Chen, Zhengyu and Kuang, Kun and Yuan, Luotian and Ye, Xinhai and Wang, Zhihua and Wu, Fei and Wei, Ying},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10161--10176},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jiang22a/jiang22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jiang22a.html},\n abstract = \t {Meta-learning has emerged as a potent paradigm for quick learning of few-shot tasks, by leveraging the meta-knowledge learned from meta-training tasks. Well-generalized meta-knowledge that facilitates fast adaptation in each task is preferred; however, recent evidence suggests the undesirable memorization effect where the meta-knowledge simply memorizing all meta-training tasks discourages task-specific adaptation and poorly generalizes. There have been several solutions to mitigating the effect, including both regularizer-based and augmentation-based methods, while a systematic understanding of these methods in a single framework is still lacking. In this paper, we offer a novel causal perspective of meta-learning. Through the lens of causality, we conclude the universal label space as a confounder to be the causing factor of memorization and frame the two lines of prevailing methods as different deconfounder approaches. Remarkably, derived from the causal inference principle of front-door adjustment, we propose two frustratingly easy but effective deconfounder algorithms, i.e., sampling multiple versions of the meta-knowledge via Dropout and grouping the meta-knowledge into multiple bins. 
The proposed causal perspective not only brings in the two deconfounder algorithms that surpass previous works in four benchmark datasets towards combating memorization, but also opens a promising direction for meta-learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/jiang22a/jiang22a.pdf", "supp": "", "pdf_size": 561715, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2615524669736704689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/jiang22a.html" }, { "title": "The State of Sparse Training in Deep Reinforcement Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16713", "id": "16713", "proceeding": "https://proceedings.mlr.press/v162/graesser22a.html", "poster": "/media/PosterPDFs/ICML%202022/d0353558f3ae8b91febe82f5a735bb06_xwmxc87.png?t=1657222583.0615218", "slides": "", "author_site": "Laura Graesser, Utku Evci, Erich Elsen, Pablo Samuel Castro", "author": "Laura Graesser; Utku Evci; Erich Elsen; Pablo Samuel Castro", "abstract": "The use of sparse neural networks has seen rapid growth in recent years, particularly in computer vision. Their appeal stems largely from the reduced number of parameters required to train and store, as well as in an increase in learning efficiency. Somewhat surprisingly, there have been very few efforts exploring their use in Deep Reinforcement Learning (DRL). In this work we perform a systematic investigation into applying a number of existing sparse training techniques on a variety of DRL agents and environments. Our results corroborate the findings from sparse training in the computer vision domain {\u2013}sparse networks perform better than dense networks for the same parameter count{\u2013} in the DRL domain. We provide detailed analyses on how the various components in DRL are affected by the use of sparse networks and conclude by suggesting promising avenues for improving the effectiveness of sparse training methods, as well as for advancing their use in DRL.", "bibtex": "@InProceedings{pmlr-v162-graesser22a,\n title = \t {The State of Sparse Training in Deep Reinforcement Learning},\n author = {Graesser, Laura and Evci, Utku and Elsen, Erich and Castro, Pablo Samuel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7766--7792},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/graesser22a/graesser22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/graesser22a.html},\n abstract = \t {The use of sparse neural networks has seen rapid growth in recent years, particularly in computer vision. Their appeal stems largely from the reduced number of parameters required to train and store, as well as in an increase in learning efficiency. Somewhat surprisingly, there have been very few efforts exploring their use in Deep Reinforcement Learning (DRL). In this work we perform a systematic investigation into applying a number of existing sparse training techniques on a variety of DRL agents and environments. 
Our results corroborate the findings from sparse training in the computer vision domain {\u2013}sparse networks perform better than dense networks for the same parameter count{\u2013} in the DRL domain. We provide detailed analyses on how the various components in DRL are affected by the use of sparse networks and conclude by suggesting promising avenues for improving the effectiveness of sparse training methods, as well as for advancing their use in DRL.}\n}", "pdf": "https://proceedings.mlr.press/v162/graesser22a/graesser22a.pdf", "supp": "", "pdf_size": 1525736, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14192241169010551512&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Robotics at Google+Google Research, Canada; Google Research, Canada; Adept; Google Research, Canada", "aff_domain": "google.com;google.com; ;google.com", "email": "google.com;google.com; ;google.com", "github": "github.com/google-research/rigl/tree/master/rigl/rl", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/graesser22a.html", "aff_unique_index": "0+0;0;1;0", "aff_unique_norm": "Google;Adept", "aff_unique_dep": "Robotics;", "aff_unique_url": "https://www.google.com;", "aff_unique_abbr": "Google Robotics;", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "United States;Canada;" }, { "title": "The Teaching Dimension of Regularized Kernel Learners", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18039", "id": "18039", "proceeding": "https://proceedings.mlr.press/v162/qian22a.html", "poster": "/media/PosterPDFs/ICML%202022/4efb80f630ccecb2d3b9b2087b0f9c89.png?t=1657625605.4132395", "slides": "/media/icml-2022/Slides/18039.pdf", "author_site": "Hong Qian, Xu-Hui Liu, Chen-Xi Su, Aimin Zhou, Yang Yu", "author": "Hong Qian; Xu-Hui Liu; Chen-Xi Su; Aimin Zhou; Yang Yu", "abstract": "Teaching dimension (TD) is a fundamental theoretical property for understanding machine teaching algorithms. It measures the sample complexity of teaching a target hypothesis to a learner. The TD of linear learners has been studied extensively, whereas the results of teaching non-linear learners are rare. A recent result investigates the TD of polynomial and Gaussian kernel learners. Unfortunately, the theoretical bounds therein show that the TD is high when teaching those non-linear learners. Inspired by the fact that regularization can reduce the learning complexity in machine learning, a natural question is whether the similar fact happens in machine teaching. To answer this essential question, this paper proposes a unified theoretical framework termed STARKE to analyze the TD of regularized kernel learners. On the basis of STARKE, we derive a generic result of any type of kernels. Furthermore, we disclose that the TD of regularized linear and regularized polynomial kernel learners can be strictly reduced. For regularized Gaussian kernel learners, we reveal that, although their TD is infinite, their epsilon-approximate TD can be exponentially reduced compared with that of the unregularized learners. 
The extensive experimental results of teaching the optimization-based learners verify the theoretical findings.", "bibtex": "@InProceedings{pmlr-v162-qian22a,\n title = \t {The Teaching Dimension of Regularized Kernel Learners},\n author = {Qian, Hong and Liu, Xu-Hui and Su, Chen-Xi and Zhou, Aimin and Yu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17984--18002},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/qian22a/qian22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/qian22a.html},\n abstract = \t {Teaching dimension (TD) is a fundamental theoretical property for understanding machine teaching algorithms. It measures the sample complexity of teaching a target hypothesis to a learner. The TD of linear learners has been studied extensively, whereas the results of teaching non-linear learners are rare. A recent result investigates the TD of polynomial and Gaussian kernel learners. Unfortunately, the theoretical bounds therein show that the TD is high when teaching those non-linear learners. Inspired by the fact that regularization can reduce the learning complexity in machine learning, a natural question is whether the similar fact happens in machine teaching. To answer this essential question, this paper proposes a unified theoretical framework termed STARKE to analyze the TD of regularized kernel learners. On the basis of STARKE, we derive a generic result of any type of kernels. Furthermore, we disclose that the TD of regularized linear and regularized polynomial kernel learners can be strictly reduced. For regularized Gaussian kernel learners, we reveal that, although their TD is infinite, their epsilon-approximate TD can be exponentially reduced compared with that of the unregularized learners. 
The extensive experimental results of teaching the optimization-based learners verify the theoretical findings.}\n}", "pdf": "https://proceedings.mlr.press/v162/qian22a/qian22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/qian22a-supp.zip", "pdf_size": 936979, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18410286521267047669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computer Science and Technology, East China Normal University, Shanghai, China + Shanghai Key Laboratory of Multidimensional Information Processing; School of Artificial Intelligence, Nanjing University, Nanjing, China + National Key Laboratory for Novel Software Technology; School of Computer Science and Technology, East China Normal University, Shanghai, China; School of Computer Science and Technology, East China Normal University, Shanghai, China + Shanghai Institute of AI for Education; School of Artificial Intelligence, Nanjing University, Nanjing, China + National Key Laboratory for Novel Software Technology", "aff_domain": "cs.ecnu.edu.cn; ; ; ; ", "email": "cs.ecnu.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/qian22a.html", "aff_unique_index": "0+1;2+3;0;0+4;2+3", "aff_unique_norm": "East China Normal University;Shanghai Key Laboratory of Multidimensional Information Processing;Nanjing University;National Key Laboratory for Novel Software Technology;Shanghai Institute of AI for Education", "aff_unique_dep": "School of Computer Science and Technology;Multidimensional Information Processing;School of Artificial Intelligence;;", "aff_unique_url": "http://www.ecnu.edu.cn;;http://www.nju.edu.cn;;", "aff_unique_abbr": "ECNU;;Nanjing U;;", "aff_campus_unique_index": "0;2;0;0;2", "aff_campus_unique": "Shanghai;;Nanjing", "aff_country_unique_index": "0+0;0+0;0;0+0;0+0", "aff_country_unique": "China" }, { "title": "The Unsurprising Effectiveness of Pre-Trained Vision Models for Control", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17347", "id": "17347", "proceeding": "https://proceedings.mlr.press/v162/parisi22a.html", "poster": "/media/PosterPDFs/ICML%202022/9d702ffd99ad9c70ac37e506facc8c38.png?t=1657233933.8483586", "slides": "/media/icml-2022/Slides/17347.pdf", "author_site": "Simone Parisi, Aravind Rajeswaran, Senthil Purushwalkam, Abhinav Gupta", "author": "Simone Parisi; Aravind Rajeswaran; Senthil Purushwalkam; Abhinav Gupta", "abstract": "Recent years have seen the emergence of pre-trained representations as a powerful abstraction for AI applications in computer vision, natural language, and speech. However, policy learning for control is still dominated by a tabula-rasa learning paradigm, with visuo-motor policies often trained from scratch using data from deployment environments. In this context, we revisit and study the role of pre-trained visual representations for control, and in particular representations trained on large-scale computer vision datasets. Through extensive empirical evaluation in diverse control domains (Habitat, DeepMind Control, Adroit, Franka Kitchen), we isolate and study the importance of different representation training methods, data augmentations, and feature hierarchies. Overall, we find that pre-trained visual representations can be competitive or even better than ground-truth state representations to train control policies. 
This is in spite of using only out-of-domain data from standard vision datasets, without any in-domain data from the deployment environments.", "bibtex": "@InProceedings{pmlr-v162-parisi22a,\n title = \t {The Unsurprising Effectiveness of Pre-Trained Vision Models for Control},\n author = {Parisi, Simone and Rajeswaran, Aravind and Purushwalkam, Senthil and Gupta, Abhinav},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17359--17371},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/parisi22a/parisi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/parisi22a.html},\n abstract = \t {Recent years have seen the emergence of pre-trained representations as a powerful abstraction for AI applications in computer vision, natural language, and speech. However, policy learning for control is still dominated by a tabula-rasa learning paradigm, with visuo-motor policies often trained from scratch using data from deployment environments. In this context, we revisit and study the role of pre-trained visual representations for control, and in particular representations trained on large-scale computer vision datasets. Through extensive empirical evaluation in diverse control domains (Habitat, DeepMind Control, Adroit, Franka Kitchen), we isolate and study the importance of different representation training methods, data augmentations, and feature hierarchies. Overall, we find that pre-trained visual representations can be competitive or even better than ground-truth state representations to train control policies. This is in spite of using only out-of-domain data from standard vision datasets, without any in-domain data from the deployment environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/parisi22a/parisi22a.pdf", "supp": "", "pdf_size": 4240045, "gs_citation": 212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2835376063938188709&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Meta AI+1; Meta AI+1; Carnegie Mellon University+2; Meta AI+1+Carnegie Mellon University+2", "aff_domain": "robot-learning.de;fb.com; ; ", "email": "robot-learning.de;fb.com; ; ", "github": "", "project": "https://sites.google.com/view/pvr-control", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/parisi22a.html", "aff_unique_index": "0;0;2;0+2", "aff_unique_norm": "Meta;;Carnegie Mellon University", "aff_unique_dep": "Meta AI;;", "aff_unique_url": "https://meta.com;;https://www.cmu.edu", "aff_unique_abbr": "Meta;;CMU", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "United States;" }, { "title": "The dynamics of representation learning in shallow, non-linear autoencoders", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/15961", "id": "15961", "proceeding": "https://proceedings.mlr.press/v162/refinetti22a.html", "poster": "", "slides": "", "author_site": "Maria Refinetti, Sebastian Goldt", "author": "Maria Refinetti; Sebastian Goldt", "abstract": "Autoencoders are the simplest neural network for unsupervised learning, and thus an ideal framework for studying feature learning. 
While a detailed understanding of the dynamics of linear autoencoders has recently been obtained, the study of non-linear autoencoders has been hindered by the technical difficulty of handling training data with non-trivial correlations {\u2013} a fundamental prerequisite for feature extraction. Here, we study the dynamics of feature learning in non-linear, shallow autoencoders. We derive a set of asymptotically exact equations that describe the generalisation dynamics of autoencoders trained with stochastic gradient descent (SGD) in the limit of high-dimensional inputs. These equations reveal that autoencoders learn the leading principal components of their inputs sequentially. An analysis of the long-time dynamics explains the failure of sigmoidal autoencoders to learn with tied weights, and highlights the importance of training the bias in ReLU autoencoders. Building on previous results for linear networks, we analyse a modification of the vanilla SGD algorithm which allows learning of the exact principal components. Finally, we show that our equations accurately describe the generalisation dynamics of non-linear autoencoders on realistic datasets such as CIFAR10.", "bibtex": "@InProceedings{pmlr-v162-refinetti22a,\n title = \t {The dynamics of representation learning in shallow, non-linear autoencoders},\n author = {Refinetti, Maria and Goldt, Sebastian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18499--18519},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/refinetti22a/refinetti22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/refinetti22a.html},\n abstract = \t {Autoencoders are the simplest neural network for unsupervised learning, and thus an ideal framework for studying feature learning. While a detailed understanding of the dynamics of linear autoencoders has recently been obtained, the study of non-linear autoencoders has been hindered by the technical difficulty of handling training data with non-trivial correlations {\u2013} a fundamental prerequisite for feature extraction. Here, we study the dynamics of feature learning in non-linear, shallow autoencoders. We derive a set of asymptotically exact equations that describe the generalisation dynamics of autoencoders trained with stochastic gradient descent (SGD) in the limit of high-dimensional inputs. These equations reveal that autoencoders learn the leading principal components of their inputs sequentially. An analysis of the long-time dynamics explains the failure of sigmoidal autoencoders to learn with tied weights, and highlights the importance of training the bias in ReLU autoencoders. Building on previous results for linear networks, we analyse a modification of the vanilla SGD algorithm which allows learning of the exact principal components. 
Finally, we show that our equations accurately describe the generalisation dynamics of non-linear autoencoders on realistic datasets such as CIFAR10.}\n}", "pdf": "https://proceedings.mlr.press/v162/refinetti22a/refinetti22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/refinetti22a-supp.zip", "pdf_size": 1396273, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14118431460184328977&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Physics, Ecole Normale Sup\u00e9rieure, Paris, France+IdePHICS laboratory, EPFL, Lausanne, Switzerland; International School of Advanced Studies (SISSA), Trieste, Italy", "aff_domain": "ens.fr;sissa.it", "email": "ens.fr;sissa.it", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/refinetti22a.html", "aff_unique_index": "0+1;2", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;EPFL;International School of Advanced Studies", "aff_unique_dep": "Department of Physics;IdePHICS laboratory;", "aff_unique_url": "https://www.ens.fr;https://www.epfl.ch;https://www.sissa.it", "aff_unique_abbr": "ENS;EPFL;SISSA", "aff_campus_unique_index": "0+1;2", "aff_campus_unique": "Paris;Lausanne;Trieste", "aff_country_unique_index": "0+1;2", "aff_country_unique": "France;Switzerland;Italy" }, { "title": "The power of first-order smooth optimization for black-box non-smooth problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18127", "id": "18127", "proceeding": "https://proceedings.mlr.press/v162/gasnikov22a.html", "poster": "/media/PosterPDFs/ICML%202022/41e7637e7b6a9f27a98b84d3a185c7c0.png?t=1657728319.9607477", "slides": "", "author_site": "Alexander Gasnikov, Anton Novitskii, Vasilii Novitskii, Farshed Abdukhakimov, Dmitry Kamzolov, Aleksandr Beznosikov, Martin Takac, Pavel Dvurechenskii, Bin Gu", "author": "Alexander Gasnikov; Anton Novitskii; Vasilii Novitskii; Farshed Abdukhakimov; Dmitry Kamzolov; Aleksandr Beznosikov; Martin Takac; Pavel Dvurechensky; Bin Gu", "abstract": "Gradient-free/zeroth-order methods for black-box convex optimization have been extensively studied in the last decade with the main focus on oracle calls complexity. In this paper, besides the oracle complexity, we focus also on iteration complexity, and propose a generic approach that, based on optimal first-order methods, allows to obtain in a black-box fashion new zeroth-order algorithms for non-smooth convex optimization problems. Our approach not only leads to optimal oracle complexity, but also allows to obtain iteration complexity similar to first-order methods, which, in turn, allows to exploit parallel computations to accelerate the convergence of our algorithms. 
We also elaborate on extensions for stochastic optimization problems, saddle-point problems, and distributed optimization.", "bibtex": "@InProceedings{pmlr-v162-gasnikov22a,\n title = \t {The power of first-order smooth optimization for black-box non-smooth problems},\n author = {Gasnikov, Alexander and Novitskii, Anton and Novitskii, Vasilii and Abdukhakimov, Farshed and Kamzolov, Dmitry and Beznosikov, Aleksandr and Takac, Martin and Dvurechensky, Pavel and Gu, Bin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7241--7265},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gasnikov22a/gasnikov22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gasnikov22a.html},\n abstract = \t {Gradient-free/zeroth-order methods for black-box convex optimization have been extensively studied in the last decade with the main focus on oracle calls complexity. In this paper, besides the oracle complexity, we focus also on iteration complexity, and propose a generic approach that, based on optimal first-order methods, allows to obtain in a black-box fashion new zeroth-order algorithms for non-smooth convex optimization problems. Our approach not only leads to optimal oracle complexity, but also allows to obtain iteration complexity similar to first-order methods, which, in turn, allows to exploit parallel computations to accelerate the convergence of our algorithms. We also elaborate on extensions for stochastic optimization problems, saddle-point problems, and distributed optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/gasnikov22a/gasnikov22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/gasnikov22a-supp.zip", "pdf_size": 9900038, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18402722015313584669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v162/gasnikov22a.html" }, { "title": "Thompson Sampling for (Combinatorial) Pure Exploration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17555", "id": "17555", "proceeding": "https://proceedings.mlr.press/v162/wang22as.html", "poster": "/media/PosterPDFs/ICML%202022/6f3e29a35278d71c7f65495871231324.png?t=1657459866.6590614", "slides": "", "author_site": "Siwei Wang, Jun Zhu", "author": "Siwei Wang; Jun Zhu", "abstract": "Existing methods of combinatorial pure exploration mainly focus on the UCB approach. To make the algorithm efficient, they usually use the sum of upper confidence bounds within arm set $S$ to represent the upper confidence bound of $S$, which can be much larger than the tight upper confidence bound of $S$ and leads to a much higher complexity than necessary, since the empirical means of different arms in $S$ are independent. To deal with this challenge, we explore the idea of Thompson Sampling (TS) that uses independent random samples instead of the upper confidence bounds, and design the first TS-based algorithm TS-Explore for (combinatorial) pure exploration. 
In TS-Explore, the sum of independent random samples within arm set $S$ will not exceed the tight upper confidence bound of $S$ with high probability. Hence it solves the above challenge, and achieves a lower complexity upper bound than existing efficient UCB-based algorithms in general combinatorial pure exploration. As for pure exploration of classic multi-armed bandit, we show that TS-Explore achieves an asymptotically optimal complexity upper bound.", "bibtex": "@InProceedings{pmlr-v162-wang22as,\n title = \t {Thompson Sampling for ({C}ombinatorial) Pure Exploration},\n author = {Wang, Siwei and Zhu, Jun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23470--23483},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22as/wang22as.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22as.html},\n abstract = \t {Existing methods of combinatorial pure exploration mainly focus on the UCB approach. To make the algorithm efficient, they usually use the sum of upper confidence bounds within arm set $S$ to represent the upper confidence bound of $S$, which can be much larger than the tight upper confidence bound of $S$ and leads to a much higher complexity than necessary, since the empirical means of different arms in $S$ are independent. To deal with this challenge, we explore the idea of Thompson Sampling (TS) that uses independent random samples instead of the upper confidence bounds, and design the first TS-based algorithm TS-Explore for (combinatorial) pure exploration. In TS-Explore, the sum of independent random samples within arm set $S$ will not exceed the tight upper confidence bound of $S$ with high probability. Hence it solves the above challenge, and achieves a lower complexity upper bound than existing efficient UCB-based algorithms in general combinatorial pure exploration. As for pure exploration of classic multi-armed bandit, we show that TS-Explore achieves an asymptotically optimal complexity upper bound.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22as/wang22as.pdf", "supp": "", "pdf_size": 368999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1750323348048628899&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Dept. of Comp. Sci. & Tech., BNRist Center, Tsinghua-Bosch Joint ML Center, Tsinghua University; Dept. of Comp. Sci. 
& Tech., BNRist Center, Tsinghua-Bosch Joint ML Center, Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn", "email": "tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22as.html", "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Thompson Sampling for Robust Transfer in Multi-Task Bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16877", "id": "16877", "proceeding": "https://proceedings.mlr.press/v162/wang22an.html", "poster": "/media/PosterPDFs/ICML%202022/02f039058bd48307e6f653a2005c9dd2.png?t=1658100022.524182", "slides": "", "author_site": "Zhi Wang, Chicheng Zhang, Kamalika Chaudhuri", "author": "Zhi Wang; Chicheng Zhang; Kamalika Chaudhuri", "abstract": "We study the problem of online multi-task learning where the tasks are performed within similar but not necessarily identical multi-armed bandit environments. In particular, we study how a learner can improve its overall performance across multiple related tasks through robust transfer of knowledge. While an upper confidence bound (UCB)-based algorithm has recently been shown to achieve nearly-optimal performance guarantees in a setting where all tasks are solved concurrently, it remains unclear whether Thompson sampling (TS) algorithms, which have superior empirical performance in general, share similar theoretical properties. In this work, we present a TS-type algorithm for a more general online multi-task learning protocol, which extends the concurrent setting. We provide its frequentist analysis and prove that it is also nearly-optimal using a novel concentration inequality for multi-task data aggregation at random stopping times. Finally, we evaluate the algorithm on synthetic data and show that the TS-type algorithm enjoys superior empirical performance in comparison with the UCB-based algorithm and a baseline algorithm that performs TS for each individual task without transfer.", "bibtex": "@InProceedings{pmlr-v162-wang22an,\n title = \t {Thompson Sampling for Robust Transfer in Multi-Task Bandits},\n author = {Wang, Zhi and Zhang, Chicheng and Chaudhuri, Kamalika},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23363--23416},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22an/wang22an.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22an.html},\n abstract = \t {We study the problem of online multi-task learning where the tasks are performed within similar but not necessarily identical multi-armed bandit environments. In particular, we study how a learner can improve its overall performance across multiple related tasks through robust transfer of knowledge. 
While an upper confidence bound (UCB)-based algorithm has recently been shown to achieve nearly-optimal performance guarantees in a setting where all tasks are solved concurrently, it remains unclear whether Thompson sampling (TS) algorithms, which have superior empirical performance in general, share similar theoretical properties. In this work, we present a TS-type algorithm for a more general online multi-task learning protocol, which extends the concurrent setting. We provide its frequentist analysis and prove that it is also nearly-optimal using a novel concentration inequality for multi-task data aggregation at random stopping times. Finally, we evaluate the algorithm on synthetic data and show that the TS-type algorithm enjoys superior empirical performance in comparison with the UCB-based algorithm and a baseline algorithm that performs TS for each individual task without transfer.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22an/wang22an.pdf", "supp": "", "pdf_size": 11188145, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9498764153726193190&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of California San Diego; University of Arizona; Facebook AI Research", "aff_domain": "ucsd.edu; ; ", "email": "ucsd.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22an.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, San Diego;University of Arizona;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://ucsd.edu;https://www.arizona.edu;https://research.facebook.com", "aff_unique_abbr": "UCSD;UA;FAIR", "aff_campus_unique_index": "0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Three-stage Evolution and Fast Equilibrium for SGD with Non-degerate Critical Points", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16527", "id": "16527", "proceeding": "https://proceedings.mlr.press/v162/wang22ab.html", "poster": "/media/PosterPDFs/ICML%202022/33b879e7ab79f56af1e88359f9314a10.png?t=1657571652.1750314", "slides": "", "author_site": "Yi Wang, Zhiren Wang", "author": "Yi Wang; Zhiren Wang", "abstract": "We justify the fast equilibrium conjecture on stochastic gradient descent from (Li et al. 2020) under the assumptions that critical points are non-degenerate and the stochastic noise is a standard Gaussian. In this case, we prove an SGD with constant effective learning rate consists of three stages: descent, diffusion and tunneling, and explicitly identify temporary equilibrium states in the normalized parameter space that can be observed within practical training time. This interprets the gap between the mixing time in the fast equilibrium conjecture and the previously known upper bound. While our assumptions do not represent typical implementations of SGD of neural networks in practice, this is the first description of the three-stage mechanism in any case. The main finding in this mechanism is that a temporary equilibrium of local nature is quickly achieved after polynomial time (in term of the reciprocal of the intrinsic learning rate) and then stabilizes within observable time scales; and that the temporary equilibrium is in general different from the global Gibbs equilibrium, which will only appear after an exponentially long period beyond typical training limits. 
Our experiments support that this mechanism may extend to the general case.", "bibtex": "@InProceedings{pmlr-v162-wang22ab,\n title = \t {Three-stage Evolution and Fast Equilibrium for {SGD} with Non-degerate Critical Points},\n author = {Wang, Yi and Wang, Zhiren},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23092--23113},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ab/wang22ab.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ab.html},\n abstract = \t {We justify the fast equilibrium conjecture on stochastic gradient descent from (Li et al. 2020) under the assumptions that critical points are non-degenerate and the stochastic noise is a standard Gaussian. In this case, we prove an SGD with constant effective learning rate consists of three stages: descent, diffusion and tunneling, and explicitly identify temporary equilibrium states in the normalized parameter space that can be observed within practical training time. This interprets the gap between the mixing time in the fast equilibrium conjecture and the previously known upper bound. While our assumptions do not represent typical implementations of SGD of neural networks in practice, this is the first description of the three-stage mechanism in any case. The main finding in this mechanism is that a temporary equilibrium of local nature is quickly achieved after polynomial time (in term of the reciprocal of the intrinsic learning rate) and then stabilizes within observable time scales; and that the temporary equilibrium is in general different from the global Gibbs equilibrium, which will only appear after an exponentially long period beyond typical training limits. 
Our experiments support that this mechanism may extend to the general case.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ab/wang22ab.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22ab-supp.zip", "pdf_size": 1220697, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6144358979432257648&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Mathematics, Johns Hopkins University; Department of Mathematics, Pennsylvania State University", "aff_domain": "math.jhu.edu;psu.edu", "email": "math.jhu.edu;psu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/wang22ab.html", "aff_unique_index": "0;1", "aff_unique_norm": "Johns Hopkins University;Pennsylvania State University", "aff_unique_dep": "Department of Mathematics;Department of Mathematics", "aff_unique_url": "https://www.jhu.edu;https://www.psu.edu", "aff_unique_abbr": "JHU;PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Thresholded Lasso Bandit", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18277", "id": "18277", "proceeding": "https://proceedings.mlr.press/v162/ariu22a.html", "poster": "/media/PosterPDFs/ICML%202022/ea5d2f1c4608232e07d3aa3d998e5135.png?t=1657787622.8468766", "slides": "", "author_site": "Kaito Ariu, Kenshi Abe, Alexandre Proutiere", "author": "Kaito Ariu; Kenshi Abe; Alexandre Proutiere", "abstract": "In this paper, we revisit the regret minimization problem in sparse stochastic contextual linear bandits, where feature vectors may be of large dimension $d$, but where the reward function depends on a few, say $s_0\\ll d$, of these features only. We present Thresholded Lasso bandit, an algorithm that (i) estimates the vector defining the reward function as well as its sparse support, i.e., significant feature elements, using the Lasso framework with thresholding, and (ii) selects an arm greedily according to this estimate projected on its support. The algorithm does not require prior knowledge of the sparsity index $s_0$ and can be parameter-free under some symmetric assumptions. For this simple algorithm, we establish non-asymptotic regret upper bounds scaling as $\\mathcal{O}( \\log d + \\sqrt{T} )$ in general, and as $\\mathcal{O}( \\log d + \\log T)$ under the so-called margin condition (a probabilistic condition on the separation of the arm rewards). The regret of previous algorithms scales as $\\mathcal{O}( \\log d + \\sqrt{T \\log (d T)})$ and $\\mathcal{O}( \\log T \\log d)$ in the two settings, respectively. 
Through numerical experiments, we confirm that our algorithm outperforms existing methods.", "bibtex": "@InProceedings{pmlr-v162-ariu22a,\n title = \t {Thresholded Lasso Bandit},\n author = {Ariu, Kaito and Abe, Kenshi and Proutiere, Alexandre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {878--928},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ariu22a/ariu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ariu22a.html},\n abstract = \t {In this paper, we revisit the regret minimization problem in sparse stochastic contextual linear bandits, where feature vectors may be of large dimension $d$, but where the reward function depends on a few, say $s_0\\ll d$, of these features only. We present Thresholded Lasso bandit, an algorithm that (i) estimates the vector defining the reward function as well as its sparse support, i.e., significant feature elements, using the Lasso framework with thresholding, and (ii) selects an arm greedily according to this estimate projected on its support. The algorithm does not require prior knowledge of the sparsity index $s_0$ and can be parameter-free under some symmetric assumptions. For this simple algorithm, we establish non-asymptotic regret upper bounds scaling as $\\mathcal{O}( \\log d + \\sqrt{T} )$ in general, and as $\\mathcal{O}( \\log d + \\log T)$ under the so-called margin condition (a probabilistic condition on the separation of the arm rewards). The regret of previous algorithms scales as $\\mathcal{O}( \\log d + \\sqrt{T \\log (d T)})$ and $\\mathcal{O}( \\log T \\log d)$ in the two settings, respectively. 
Through numerical experiments, we confirm that our algorithm outperforms existing methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/ariu22a/ariu22a.pdf", "supp": "", "pdf_size": 5946792, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2549693999294336180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "EECS and Digital Futures, KTH Royal Institute of Technology, Stockholm, Sweden+Cyberagent, Inc., Tokyo, Japan; Cyberagent, Inc., Tokyo, Japan; EECS and Digital Futures, KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se; ; ", "email": "kth.se; ; ", "github": "https://github.com/CyberAgentAILab/thresholded-lasso-bandit", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ariu22a.html", "aff_unique_index": "0+1;1;0", "aff_unique_norm": "KTH Royal Institute of Technology;CyberAgent, Inc.", "aff_unique_dep": "EECS and Digital Futures;", "aff_unique_url": "https://www.kth.se;https://www.cyberagent.co.jp", "aff_unique_abbr": "KTH;", "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Stockholm;Tokyo", "aff_country_unique_index": "0+1;1;0", "aff_country_unique": "Sweden;Japan" }, { "title": "Tight and Robust Private Mean Estimation with Few Users", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16313", "id": "16313", "proceeding": "https://proceedings.mlr.press/v162/narayanan22a.html", "poster": "/media/PosterPDFs/ICML%202022/12ffb0968f2f56e51a59a6beb37b2859.png?t=1658243920.5629108", "slides": "", "author_site": "Shyam Narayanan, Vahab Mirrokni, Hossein Esfandiari", "author": "Shyam Narayanan; Vahab Mirrokni; Hossein Esfandiari", "abstract": "In this work, we study high-dimensional mean estimation under user-level differential privacy, and design an $(\\varepsilon,\\delta)$-differentially private mechanism using as few users as possible. In particular, we provide a nearly optimal trade-off between the number of users and the number of samples per user required for private mean estimation, even when the number of users is as low as $O(\\frac{1}{\\varepsilon}\\log\\frac{1}{\\delta})$. Interestingly, this bound on the number of", "bibtex": "@InProceedings{pmlr-v162-narayanan22a,\n title = \t {Tight and Robust Private Mean Estimation with Few Users},\n author = {Narayanan, Shyam and Mirrokni, Vahab and Esfandiari, Hossein},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16383--16412},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/narayanan22a/narayanan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/narayanan22a.html},\n abstract = \t {In this work, we study high-dimensional mean estimation under user-level differential privacy, and design an $(\\varepsilon,\\delta)$-differentially private mechanism using as few users as possible. In particular, we provide a nearly optimal trade-off between the number of users and the number of samples per user required for private mean estimation, even when the number of users is as low as $O(\\frac{1}{\\varepsilon}\\log\\frac{1}{\\delta})$. 
Interestingly, this bound on the number of", "pdf": "https://proceedings.mlr.press/v162/narayanan22a/narayanan22a.pdf", "supp": "", "pdf_size": 511488, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4494676039566063321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff": "Google Research, New York, NY, USA; Google Research, New York, NY, USA; Massachusetts Institute of Technology, Cambridge, MA, USA", "aff_domain": "googleresearch.com;googleresearch.com;mit.edu", "email": "googleresearch.com;googleresearch.com;mit.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/narayanan22a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.mit.edu", "aff_unique_abbr": "Google Research;MIT", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "New York;Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Time Is MattEr: Temporal Self-supervision for Video Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18243", "id": "18243", "proceeding": "https://proceedings.mlr.press/v162/yun22a.html", "poster": "/media/PosterPDFs/ICML%202022/854f1fb6f65734d9e49f708d6cd84ad6_moAF2FG.png?t=1657677747.1491117", "slides": "", "author_site": "Sukmin Yun, Jaehyung Kim, Dongyoon Han, Hwanjun Song, Jung-Woo Ha, Jinwoo Shin", "author": "Sukmin Yun; Jaehyung Kim; Dongyoon Han; Hwanjun Song; Jung-Woo Ha; Jinwoo Shin", "abstract": "Understanding temporal dynamics of video is an essential aspect of learning better video representations. Recently, transformer-based architectural designs have been extensively explored for video tasks due to their capability to capture long-term dependency of input sequences. However, we found that these Video Transformers are still biased to learn spatial dynamics rather than temporal ones, and debiasing the spurious correlation is critical for their performance. Based on the observations, we design simple yet effective self-supervised tasks for video models to learn temporal dynamics better. Specifically, for debiasing the spatial bias, our method learns the temporal order of video frames as extra self-supervision and enforces the randomly shuffled frames to have low-confidence outputs. Also, our method learns the temporal flow direction of video tokens among consecutive frames for enhancing the correlation toward temporal dynamics. 
Under various video action recognition tasks, we demonstrate the effectiveness of our method and its compatibility with state-of-the-art Video Transformers.", "bibtex": "@InProceedings{pmlr-v162-yun22a,\n title = \t {Time Is {M}att{E}r: Temporal Self-supervision for Video Transformers},\n author = {Yun, Sukmin and Kim, Jaehyung and Han, Dongyoon and Song, Hwanjun and Ha, Jung-Woo and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25804--25816},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yun22a/yun22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/yun22a.html},\n abstract = \t {Understanding temporal dynamics of video is an essential aspect of learning better video representations. Recently, transformer-based architectural designs have been extensively explored for video tasks due to their capability to capture long-term dependency of input sequences. However, we found that these Video Transformers are still biased to learn spatial dynamics rather than temporal ones, and debiasing the spurious correlation is critical for their performance. Based on the observations, we design simple yet effective self-supervised tasks for video models to learn temporal dynamics better. Specifically, for debiasing the spatial bias, our method learns the temporal order of video frames as extra self-supervision and enforces the randomly shuffled frames to have low-confidence outputs. Also, our method learns the temporal flow direction of video tokens among consecutive frames for enhancing the correlation toward temporal dynamics. Under various video action recognition tasks, we demonstrate the effectiveness of our method and its compatibility with state-of-the-art Video Transformers.}\n}", "pdf": "https://proceedings.mlr.press/v162/yun22a/yun22a.pdf", "supp": "", "pdf_size": 8202274, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10001737047837090145&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "School of Electrical Engineering, KAIST, South Korea; School of Electrical Engineering, KAIST, South Korea; NAVER AI Lab, South Korea; NAVER AI Lab, South Korea; NAVER AI Lab, South Korea; Graduate School of AI, KAIST, South Korea + School of Electrical Engineering, KAIST, South Korea", "aff_domain": "kaist.ac.kr; ; ; ; ; ", "email": "kaist.ac.kr; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/yun22a.html", "aff_unique_index": "0;0;1;1;1;0+0", "aff_unique_norm": "KAIST;NAVER AI Lab", "aff_unique_dep": "School of Electrical Engineering;AI Lab", "aff_unique_url": "https://www.kaist.ac.kr;https://www.naver.com", "aff_unique_abbr": "KAIST;NAVER AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0+0", "aff_country_unique": "South Korea" }, { "title": "To Smooth or Not? 
When Label Smoothing Meets Noisy Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17073", "id": "17073", "proceeding": "https://proceedings.mlr.press/v162/wei22b.html", "poster": "", "slides": "", "author_site": "Jiaheng Wei, Hangyu Liu, Tongliang Liu, Gang Niu, Masashi Sugiyama, Yang Liu", "author": "Jiaheng Wei; Hangyu Liu; Tongliang Liu; Gang Niu; Masashi Sugiyama; Yang Liu", "abstract": "Label smoothing (LS) is an arising learning paradigm that uses the positively weighted average of both the hard training labels and uniformly distributed soft labels. It was shown that LS serves as a regularizer for training data with hard labels and therefore improves the generalization of the model. Later it was reported LS even helps with improving robustness when learning with noisy labels. However, we observed that the advantage of LS vanishes when we operate in a high label noise regime. Intuitively speaking, this is due to the increased entropy of P(noisy label|X) when the noise rate is high, in which case, further applying LS tends to \u201cover-smooth\u201d the estimated posterior. We proceeded to discover that several learning-with-noisy-labels solutions in the literature instead relate more closely to negative/not label smoothing (NLS), which acts counter to LS and defines as using a negative weight to combine the hard and soft labels! We provide understandings for the properties of LS and NLS when learning with noisy labels. Among other established properties, we theoretically show NLS is considered more beneficial when the label noise rates are high. We provide extensive experimental results on multiple benchmarks to support our findings too. Code is publicly available at https://github.com/UCSC-REAL/negative-label-smoothing.", "bibtex": "@InProceedings{pmlr-v162-wei22b,\n title = \t {To Smooth or Not? {W}hen Label Smoothing Meets Noisy Labels},\n author = {Wei, Jiaheng and Liu, Hangyu and Liu, Tongliang and Niu, Gang and Sugiyama, Masashi and Liu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23589--23614},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wei22b/wei22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/wei22b.html},\n abstract = \t {Label smoothing (LS) is an arising learning paradigm that uses the positively weighted average of both the hard training labels and uniformly distributed soft labels. It was shown that LS serves as a regularizer for training data with hard labels and therefore improves the generalization of the model. Later it was reported LS even helps with improving robustness when learning with noisy labels. However, we observed that the advantage of LS vanishes when we operate in a high label noise regime. Intuitively speaking, this is due to the increased entropy of P(noisy label|X) when the noise rate is high, in which case, further applying LS tends to \u201cover-smooth\u201d the estimated posterior. We proceeded to discover that several learning-with-noisy-labels solutions in the literature instead relate more closely to negative/not label smoothing (NLS), which acts counter to LS and defines as using a negative weight to combine the hard and soft labels! 
We provide understandings for the properties of LS and NLS when learning with noisy labels. Among other established properties, we theoretically show NLS is considered more beneficial when the label noise rates are high. We provide extensive experimental results on multiple benchmarks to support our findings too. Code is publicly available at https://github.com/UCSC-REAL/negative-label-smoothing.}\n}", "pdf": "https://proceedings.mlr.press/v162/wei22b/wei22b.pdf", "supp": "", "pdf_size": 8090548, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18297648993704774023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "https://github.com/UCSC-REAL/negative-label-smoothing", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wei22b.html" }, { "title": "Topology-Aware Network Pruning using Multi-stage Graph Embedding and Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16771", "id": "16771", "proceeding": "https://proceedings.mlr.press/v162/yu22e.html", "poster": "/media/PosterPDFs/ICML%202022/821fa74b50ba3f7cba1e6c53e8fa6845.png?t=1657732927.3571782", "slides": "", "author_site": "Sixing Yu, Arya Mazaheri, Ali Jannesari", "author": "Sixing Yu; Arya Mazaheri; Ali Jannesari", "abstract": "Model compression is an essential technique for deploying deep neural networks (DNNs) on power and memory-constrained resources. However, existing model-compression methods often rely on human expertise and focus on parameters\u2019 local importance, ignoring the rich topology information within DNNs. In this paper, we propose a novel multi-stage graph embedding technique based on graph neural networks (GNNs) to identify DNN topologies and use reinforcement learning (RL) to find a suitable compression policy. We performed resource-constrained (i.e., FLOPs) channel pruning and compared our approach with state-of-the-art model compression methods. We evaluated our method on various models from typical to mobile-friendly networks, such as ResNet family, VGG-16, MobileNet-v1/v2, and ShuffleNet. Results show that our method can achieve higher compression ratios with a minimal fine-tuning cost yet yields outstanding and competitive performance.", "bibtex": "@InProceedings{pmlr-v162-yu22e,\n title = \t {Topology-Aware Network Pruning using Multi-stage Graph Embedding and Reinforcement Learning},\n author = {Yu, Sixing and Mazaheri, Arya and Jannesari, Ali},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25656--25667},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22e/yu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22e.html},\n abstract = \t {Model compression is an essential technique for deploying deep neural networks (DNNs) on power and memory-constrained resources. However, existing model-compression methods often rely on human expertise and focus on parameters\u2019 local importance, ignoring the rich topology information within DNNs. 
In this paper, we propose a novel multi-stage graph embedding technique based on graph neural networks (GNNs) to identify DNN topologies and use reinforcement learning (RL) to find a suitable compression policy. We performed resource-constrained (i.e., FLOPs) channel pruning and compared our approach with state-of-the-art model compression methods. We evaluated our method on various models from typical to mobile-friendly networks, such as ResNet family, VGG-16, MobileNet-v1/v2, and ShuffleNet. Results show that our method can achieve higher compression ratios with a minimal fine-tuning cost yet yields outstanding and competitive performance.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22e/yu22e.pdf", "supp": "", "pdf_size": 610272, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9807843131373835884&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Iowa State University, Iowa, US+Department of Computer Science, Technical University of Darmstadt, Darmstadt, Germany; Department of Computer Science, Technical University of Darmstadt, Darmstadt, Germany; Department of Computer Science, Iowa State University, Iowa, US", "aff_domain": "iastate.edu;tudarmstadt.de;iastate.edu", "email": "iastate.edu;tudarmstadt.de;iastate.edu", "github": "https://github.com/yusx-swapp/GNN-RL-Model-Compression", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/yu22e.html", "aff_unique_index": "0+1;1;0", "aff_unique_norm": "Iowa State University;Technical University of Darmstadt", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.iastate.edu;https://www.tu-darmstadt.de", "aff_unique_abbr": "ISU;TUD", "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Iowa;Darmstadt", "aff_country_unique_index": "0+1;1;0", "aff_country_unique": "United States;Germany" }, { "title": "Topology-aware Generalization of Decentralized SGD", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18373", "id": "18373", "proceeding": "https://proceedings.mlr.press/v162/zhu22d.html", "poster": "/media/PosterPDFs/ICML%202022/1f34004ebcb05f9acda6016d5cc52d5e_cktj852.png?t=1658258150.4009476", "slides": "", "author_site": "Tongtian Zhu, Fengxiang He, Lan Zhang, Zhengyang Niu, Mingli Song, Dacheng Tao", "author": "Tongtian Zhu; Fengxiang He; Lan Zhang; Zhengyang Niu; Mingli Song; Dacheng Tao", "abstract": "This paper studies the algorithmic stability and generalizability of decentralized stochastic gradient descent (D-SGD). We prove that the consensus model learned by D-SGD is $\\mathcal{O}{(m/N\\unaryplus1/m\\unaryplus\\lambda^2)}$-stable in expectation in the non-convex non-smooth setting, where $N$ is the total sample size of the whole system, $m$ is the worker number, and $1\\unaryminus\\lambda$ is the spectral gap that measures the connectivity of the communication topology. These results then deliver an $\\mathcal{O}{(1/N\\unaryplus{({(m^{-1}\\lambda^2)}^{\\frac{\\alpha}{2}}\\unaryplus m^{\\unaryminus\\alpha})}/{N^{1\\unaryminus\\frac{\\alpha}{2}}})}$ in-average generalization bound, which is non-vacuous even when $\\lambda$ is closed to $1$, in contrast to vacuous as suggested by existing literature on the projected version of D-SGD. Our theory indicates that the generalizability of D-SGD has a positive correlation with the spectral gap, and can explain why consensus control in initial training phase can ensure better generalization. 
Experiments of VGG-11 and ResNet-18 on CIFAR-10, CIFAR-100 and Tiny-ImageNet justify our theory. To our best knowledge, this is the first work on the topology-aware generalization of vanilla D-SGD. Code is available at \\url{https://github.com/Raiden-Zhu/Generalization-of-DSGD}.", "bibtex": "@InProceedings{pmlr-v162-zhu22d,\n title = \t {Topology-aware Generalization of Decentralized {SGD}},\n author = {Zhu, Tongtian and He, Fengxiang and Zhang, Lan and Niu, Zhengyang and Song, Mingli and Tao, Dacheng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27479--27503},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22d/zhu22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22d.html},\n abstract = \t {This paper studies the algorithmic stability and generalizability of decentralized stochastic gradient descent (D-SGD). We prove that the consensus model learned by D-SGD is $\\mathcal{O}{(m/N\\unaryplus1/m\\unaryplus\\lambda^2)}$-stable in expectation in the non-convex non-smooth setting, where $N$ is the total sample size of the whole system, $m$ is the worker number, and $1\\unaryminus\\lambda$ is the spectral gap that measures the connectivity of the communication topology. These results then deliver an $\\mathcal{O}{(1/N\\unaryplus{({(m^{-1}\\lambda^2)}^{\\frac{\\alpha}{2}}\\unaryplus m^{\\unaryminus\\alpha})}/{N^{1\\unaryminus\\frac{\\alpha}{2}}})}$ in-average generalization bound, which is non-vacuous even when $\\lambda$ is closed to $1$, in contrast to vacuous as suggested by existing literature on the projected version of D-SGD. Our theory indicates that the generalizability of D-SGD has a positive correlation with the spectral gap, and can explain why consensus control in initial training phase can ensure better generalization. Experiments of VGG-11 and ResNet-18 on CIFAR-10, CIFAR-100 and Tiny-ImageNet justify our theory. To our best knowledge, this is the first work on the topology-aware generalization of vanilla D-SGD. 
Code is available at \\url{https://github.com/Raiden-Zhu/Generalization-of-DSGD}.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22d/zhu22d.pdf", "supp": "", "pdf_size": 1411408, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17709285400263398599&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "College of Computer Science and Technology, Zhejiang University+Shanghai Institute for Advanced Study of Zhejiang University+JD Explore Academy, JD.com Inc.; JD Explore Academy, JD.com Inc.; School of Computer Science and Technology, University of Science and Technology of China+Institute of Artificial Intelligence, Hefei Comprehensive National Science Center; School of Computer Science, Wuhan University; Zhejiang University City College; JD Explore Academy, JD.com Inc.", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "https://github.com/Raiden-Zhu/Generalization-of-DSGD", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/zhu22d.html", "aff_unique_index": "0+0+1;1;2+3;4;5;1", "aff_unique_norm": "Zhejiang University;JD.com Inc.;University of Science and Technology of China;Hefei Comprehensive National Science Center;Wuhan University;Zhejiang University City College", "aff_unique_dep": "College of Computer Science and Technology;JD Explore Academy;School of Computer Science and Technology;Institute of Artificial Intelligence;School of Computer Science;", "aff_unique_url": "http://www.zju.edu.cn;https://www.jd.com;http://www.ustc.edu.cn;http://www.hfcn.edu.cn;http://www.whu.edu.cn;http://www.zucc.edu.cn", "aff_unique_abbr": "ZJU;JD.com;USTC;;WHU;", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Shanghai;Hefei;Wuhan", "aff_country_unique_index": "0+0+0;0;0+0;0;0;0", "aff_country_unique": "China" }, { "title": "Toward Compositional Generalization in Object-Oriented World Modeling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18211", "id": "18211", "proceeding": "https://proceedings.mlr.press/v162/zhao22b.html", "poster": "", "slides": "", "author_site": "Linfeng Zhao, Lingzhi Kong, Robin Walters, Lawson Wong", "author": "Linfeng Zhao; Lingzhi Kong; Robin Walters; Lawson L.S. Wong", "abstract": "Compositional generalization is a critical ability in learning and decision-making. We focus on the setting of reinforcement learning in object-oriented environments to study compositional generalization in world modeling. We (1) formalize the compositional generalization problem with an algebraic approach and (2) study how a world model can achieve that. We introduce a conceptual environment, Object Library, and two instances, and deploy a principled pipeline to measure the generalization ability. 
Motivated by the formulation, we analyze several methods with exact or no compositional generalization ability using our framework, and design a differentiable approach, Homomorphic Object-oriented World Model (HOWM), that achieves soft but more efficient compositional generalization.", "bibtex": "@InProceedings{pmlr-v162-zhao22b,\n title = \t {Toward Compositional Generalization in Object-Oriented World Modeling},\n author = {Zhao, Linfeng and Kong, Lingzhi and Walters, Robin and Wong, Lawson L.S.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26841--26864},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhao22b/zhao22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhao22b.html},\n abstract = \t {Compositional generalization is a critical ability in learning and decision-making. We focus on the setting of reinforcement learning in object-oriented environments to study compositional generalization in world modeling. We (1) formalize the compositional generalization problem with an algebraic approach and (2) study how a world model can achieve that. We introduce a conceptual environment, Object Library, and two instances, and deploy a principled pipeline to measure the generalization ability. Motivated by the formulation, we analyze several methods with exact or no compositional generalization ability using our framework, and design a differentiable approach, Homomorphic Object-oriented World Model (HOWM), that achieves soft but more efficient compositional generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhao22b/zhao22b.pdf", "supp": "", "pdf_size": 1894133, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9799895246949878920&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Khoury College of Computer Sciences, Northeastern University, MA; Khoury College of Computer Sciences, Northeastern University, MA; Khoury College of Computer Sciences, Northeastern University, MA; Khoury College of Computer Sciences, Northeastern University, MA", "aff_domain": "northeastern.edu; ; ; ", "email": "northeastern.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhao22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "Khoury College of Computer Sciences", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "MA", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards Coherent and Consistent Use of Entities in Narrative Generation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16355", "id": "16355", "proceeding": "https://proceedings.mlr.press/v162/papalampidi22a.html", "poster": "/media/PosterPDFs/ICML%202022/c61aed648da48aa3893fb3eaadd88a7f.png?t=1657531024.274415", "slides": "", "author_site": "Pinelopi Papalampidi, Kris Cao, Tomas Kocisky", "author": "Pinelopi Papalampidi; Kris Cao; Tomas Kocisky", "abstract": "Large pre-trained language models (LMs) have demonstrated impressive capabilities in generating long, fluent text; however, there is 
little to no analysis on their ability to maintain entity coherence and consistency. In this work, we focus on the end task of narrative generation and systematically analyse the long-range entity coherence and consistency in generated stories. First, we propose a set of automatic metrics for measuring model performance in terms of entity usage. Given these metrics, we quantify the limitations of current LMs. Next, we propose augmenting a pre-trained LM with a dynamic entity memory in an end-to-end manner by using an auxiliary entity-related loss for guiding the reads and writes to the memory. We demonstrate that the dynamic entity memory increases entity coherence according to both automatic and human judgment and helps preserving entity-related information especially in settings with a limited context window. Finally, we also validate that our automatic metrics are correlated with human ratings and serve as a good indicator of the quality of generated stories.", "bibtex": "@InProceedings{pmlr-v162-papalampidi22a,\n title = \t {Towards Coherent and Consistent Use of Entities in Narrative Generation},\n author = {Papalampidi, Pinelopi and Cao, Kris and Kocisky, Tomas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17278--17294},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/papalampidi22a/papalampidi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/papalampidi22a.html},\n abstract = \t {Large pre-trained language models (LMs) have demonstrated impressive capabilities in generating long, fluent text; however, there is little to no analysis on their ability to maintain entity coherence and consistency. In this work, we focus on the end task of narrative generation and systematically analyse the long-range entity coherence and consistency in generated stories. First, we propose a set of automatic metrics for measuring model performance in terms of entity usage. Given these metrics, we quantify the limitations of current LMs. Next, we propose augmenting a pre-trained LM with a dynamic entity memory in an end-to-end manner by using an auxiliary entity-related loss for guiding the reads and writes to the memory. We demonstrate that the dynamic entity memory increases entity coherence according to both automatic and human judgment and helps preserving entity-related information especially in settings with a limited context window. 
Finally, we also validate that our automatic metrics are correlated with human ratings and serve as a good indicator of the quality of generated stories.}\n}", "pdf": "https://proceedings.mlr.press/v162/papalampidi22a/papalampidi22a.pdf", "supp": "", "pdf_size": 666438, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3888239152813411015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Edinburgh, UK+DeepMind, UK; DeepMind, UK; DeepMind, UK", "aff_domain": "sms.ed.ac.uk; ; ", "email": "sms.ed.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/papalampidi22a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "University of Edinburgh;DeepMind", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://deepmind.com", "aff_unique_abbr": "Edinburgh;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Towards Evaluating Adaptivity of Model-Based Reinforcement Learning Methods", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17973", "id": "17973", "proceeding": "https://proceedings.mlr.press/v162/wan22d.html", "poster": "/media/PosterPDFs/ICML%202022/443dec3062d0286986e21dc0631734c9.png?t=1656158414.2565486", "slides": "", "author_site": "Yi Wan, Ali Rahimi-Kalahroudi, Janarthanan Rajendran, Ida Momennejad, Sarath Chandar, Harm van Seijen", "author": "Yi Wan; Ali Rahimi-Kalahroudi; Janarthanan Rajendran; Ida Momennejad; Sarath Chandar; Harm H Van Seijen", "abstract": "In recent years, a growing number of deep model-based reinforcement learning (RL) methods have been introduced. The interest in deep model-based RL is not surprising, given its many potential benefits, such as higher sample efficiency and the potential for fast adaption to changes in the environment. However, we demonstrate, using an improved version of the recently introduced Local Change Adaptation (LoCA) setup, that well-known model-based methods such as PlaNet and DreamerV2 perform poorly in their ability to adapt to local environmental changes. Combined with prior work that made a similar observation about the other popular model-based method, MuZero, a trend appears to emerge, suggesting that current deep model-based methods have serious limitations. We dive deeper into the causes of this poor performance, by identifying elements that hurt adaptive behavior and linking these to underlying techniques frequently used in deep model-based RL. We empirically validate these insights in the case of linear function approximation by demonstrating that a modified version of linear Dyna achieves effective adaptation to local changes. 
Furthermore, we provide detailed insights into the challenges of building an adaptive nonlinear model-based method, by experimenting with a nonlinear version of Dyna.", "bibtex": "@InProceedings{pmlr-v162-wan22d,\n title = \t {Towards Evaluating Adaptivity of Model-Based Reinforcement Learning Methods},\n author = {Wan, Yi and Rahimi-Kalahroudi, Ali and Rajendran, Janarthanan and Momennejad, Ida and Chandar, Sarath and Van Seijen, Harm H},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22536--22561},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wan22d/wan22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/wan22d.html},\n abstract = \t {In recent years, a growing number of deep model-based reinforcement learning (RL) methods have been introduced. The interest in deep model-based RL is not surprising, given its many potential benefits, such as higher sample efficiency and the potential for fast adaption to changes in the environment. However, we demonstrate, using an improved version of the recently introduced Local Change Adaptation (LoCA) setup, that well-known model-based methods such as PlaNet and DreamerV2 perform poorly in their ability to adapt to local environmental changes. Combined with prior work that made a similar observation about the other popular model-based method, MuZero, a trend appears to emerge, suggesting that current deep model-based methods have serious limitations. We dive deeper into the causes of this poor performance, by identifying elements that hurt adaptive behavior and linking these to underlying techniques frequently used in deep model-based RL. We empirically validate these insights in the case of linear function approximation by demonstrating that a modified version of linear Dyna achieves effective adaptation to local changes. 
Furthermore, we provide detailed insights into the challenges of building an adaptive nonlinear model-based method, by experimenting with a nonlinear version of Dyna.}\n}", "pdf": "https://proceedings.mlr.press/v162/wan22d/wan22d.pdf", "supp": "", "pdf_size": 6053584, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8278156303366460605&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of Alberta + Mila - Quebec AI Institute + Universite de Montreal; Mila - Quebec AI Institute + Universite de Montreal; Mila - Quebec AI Institute + Universite de Montreal; Microsoft; Mila - Quebec AI Institute + Ecole Polytechnique de Montreal + Canada CIFAR AI Chair; Microsoft", "aff_domain": "ualberta.ca;mila.quebec; ; ; ; ", "email": "ualberta.ca;mila.quebec; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/wan22d.html", "aff_unique_index": "0+1+2;1+2;1+2;3;1+4+5;3", "aff_unique_norm": "University of Alberta;Quebec AI Institute;Universit\u00e9 de Montr\u00e9al;Microsoft;Ecole Polytechnique de Montreal;Canadian Institute for Advanced Research", "aff_unique_dep": ";AI Institute;;Microsoft Corporation;;AI Chair", "aff_unique_url": "https://www.ualberta.ca;https://mila.quebec;https://www.umontreal.ca;https://www.microsoft.com;https://www.polymtl.ca;https://www.cifar.ca", "aff_unique_abbr": "UAlberta;Mila;UM;Microsoft;Polytechnique Montreal;CIFAR", "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0+0+0;0+0;0+0;1;0+0+0;1", "aff_country_unique": "Canada;United States" }, { "title": "Towards Noise-adaptive, Problem-adaptive (Accelerated) Stochastic Gradient Descent", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16309", "id": "16309", "proceeding": "https://proceedings.mlr.press/v162/vaswani22a.html", "poster": "/media/PosterPDFs/ICML%202022/477b02d99dd6c00c5ba852bb9a9e1f6c.png?t=1658150185.9086707", "slides": "", "author_site": "Sharan Vaswani, Benjamin Dubois-Taine, Reza Babanezhad", "author": "Sharan Vaswani; Benjamin Dubois-Taine; Reza Babanezhad", "abstract": "We aim to make stochastic gradient descent (SGD) adaptive to (i) the noise $\\sigma^2$ in the stochastic gradients and (ii) problem-dependent constants. When minimizing smooth, strongly-convex functions with condition number $\\kappa$, we prove that $T$ iterations of SGD with exponentially decreasing step-sizes and knowledge of the smoothness can achieve an $\\tilde{O} \\left(\\exp \\left( \\nicefrac{-T}{\\kappa} \\right) + \\nicefrac{\\sigma^2}{T} \\right)$ rate, without knowing $\\sigma^2$. In order to be adaptive to the smoothness, we use a stochastic line-search (SLS) and show (via upper and lower-bounds) that SGD with SLS converges at the desired rate, but only to a neighbourhood of the solution. On the other hand, we prove that SGD with an offline estimate of the smoothness converges to the minimizer. However, its rate is slowed down proportional to the estimation error. Next, we prove that SGD with Nesterov acceleration and exponential step-sizes (referred to as ASGD) can achieve the near-optimal $\\tilde{O} \\left(\\exp \\left( \\nicefrac{-T}{\\sqrt{\\kappa}} \\right) + \\nicefrac{\\sigma^2}{T} \\right)$ rate, without knowledge of $\\sigma^2$. When used with offline estimates of the smoothness and strong-convexity, ASGD still converges to the solution, albeit at a slower rate. 
Finally, we empirically demonstrate the effectiveness of exponential step-sizes coupled with a novel variant of SLS.", "bibtex": "@InProceedings{pmlr-v162-vaswani22a,\n title = \t {Towards Noise-adaptive, Problem-adaptive ({A}ccelerated) Stochastic Gradient Descent},\n author = {Vaswani, Sharan and Dubois-Taine, Benjamin and Babanezhad, Reza},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22015--22059},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vaswani22a/vaswani22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vaswani22a.html},\n abstract = \t {We aim to make stochastic gradient descent (SGD) adaptive to (i) the noise $\\sigma^2$ in the stochastic gradients and (ii) problem-dependent constants. When minimizing smooth, strongly-convex functions with condition number $\\kappa$, we prove that $T$ iterations of SGD with exponentially decreasing step-sizes and knowledge of the smoothness can achieve an $\\tilde{O} \\left(\\exp \\left( \\nicefrac{-T}{\\kappa} \\right) + \\nicefrac{\\sigma^2}{T} \\right)$ rate, without knowing $\\sigma^2$. In order to be adaptive to the smoothness, we use a stochastic line-search (SLS) and show (via upper and lower-bounds) that SGD with SLS converges at the desired rate, but only to a neighbourhood of the solution. On the other hand, we prove that SGD with an offline estimate of the smoothness converges to the minimizer. However, its rate is slowed down proportional to the estimation error. Next, we prove that SGD with Nesterov acceleration and exponential step-sizes (referred to as ASGD) can achieve the near-optimal $\\tilde{O} \\left(\\exp \\left( \\nicefrac{-T}{\\sqrt{\\kappa}} \\right) + \\nicefrac{\\sigma^2}{T} \\right)$ rate, without knowledge of $\\sigma^2$. When used with offline estimates of the smoothness and strong-convexity, ASGD still converges to the solution, albeit at a slower rate. 
Finally, we empirically demonstrate the effectiveness of exponential step-sizes coupled with a novel variant of SLS.}\n}", "pdf": "https://proceedings.mlr.press/v162/vaswani22a/vaswani22a.pdf", "supp": "", "pdf_size": 723134, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7346930168965224831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Simon Fraser University; DI ENS, Ecole normale sup\u00e9rieure, Universit\u00e9 PSL, CNRS, INRIA, 75005 Paris, France; SAIT AI lab, Montreal", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/vaswani22a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Simon Fraser University;Ecole Normale Sup\u00e9rieure;SAIT AI lab", "aff_unique_dep": ";DI ENS;AI lab", "aff_unique_url": "https://www.sfu.ca;https://www.ens.fr;", "aff_unique_abbr": "SFU;ENS;", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Paris;Montreal", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;France" }, { "title": "Towards Scaling Difference Target Propagation by Learning Backprop Targets", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18239", "id": "18239", "proceeding": "https://proceedings.mlr.press/v162/ernoult22a.html", "poster": "", "slides": "", "author_site": "Maxence ERNOULT, Fabrice Normandin, Abhinav Moudgil, Sean Spinney, Eugene Belilovsky, Irina Rish, Blake Richards, Yoshua Bengio", "author": "Maxence M Ernoult; Fabrice Normandin; Abhinav Moudgil; Sean Spinney; Eugene Belilovsky; Irina Rish; Blake Richards; Yoshua Bengio", "abstract": "The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on complex tasks. One such algorithm is Difference Target Propagation (DTP), a biologically-plausible learning algorithm whose close relation with Gauss-Newton (GN) optimization has been recently established. However, the conditions under which this connection rigorously holds preclude layer-wise training of the feedback pathway synaptic weights (which is more biologically plausible). Moreover, good alignment between DTP weight updates and loss gradients is only loosely guaranteed and under very specific conditions for the architecture being trained. In this paper, we propose a novel feedback weight training scheme that ensures both that DTP approximates BP and that layer-wise feedback weight training can be restored without sacrificing any theoretical guarantees. 
Our theory is corroborated by experimental results and we report the best performance ever achieved by DTP on CIFAR-10 and ImageNet 32x32.", "bibtex": "@InProceedings{pmlr-v162-ernoult22a,\n title = \t {Towards Scaling Difference Target Propagation by Learning Backprop Targets},\n author = {Ernoult, Maxence M and Normandin, Fabrice and Moudgil, Abhinav and Spinney, Sean and Belilovsky, Eugene and Rish, Irina and Richards, Blake and Bengio, Yoshua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5968--5987},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ernoult22a/ernoult22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ernoult22a.html},\n abstract = \t {The development of biologically-plausible learning algorithms is important for understanding learning in the brain, but most of them fail to scale-up to real-world tasks, limiting their potential as explanations for learning by real brains. As such, it is important to explore learning algorithms that come with strong theoretical guarantees and can match the performance of backpropagation (BP) on complex tasks. One such algorithm is Difference Target Propagation (DTP), a biologically-plausible learning algorithm whose close relation with Gauss-Newton (GN) optimization has been recently established. However, the conditions under which this connection rigorously holds preclude layer-wise training of the feedback pathway synaptic weights (which is more biologically plausible). Moreover, good alignment between DTP weight updates and loss gradients is only loosely guaranteed and under very specific conditions for the architecture being trained. In this paper, we propose a novel feedback weight training scheme that ensures both that DTP approximates BP and that layer-wise feedback weight training can be restored without sacrificing any theoretical guarantees. Our theory is corroborated by experimental results and we report the best performance ever achieved by DTP on CIFAR-10 and ImageNet 32x32.}\n}", "pdf": "https://proceedings.mlr.press/v162/ernoult22a/ernoult22a.pdf", "supp": "", "pdf_size": 1393597, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16976057052458549832&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/ernoult22a.html" }, { "title": "Towards Theoretical Analysis of Transformation Complexity of ReLU DNNs", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16187", "id": "16187", "proceeding": "https://proceedings.mlr.press/v162/ren22b.html", "poster": "/media/PosterPDFs/ICML%202022/c8ed21db4f678f3b13b9d5ee16489088.png?t=1658044115.5435405", "slides": "", "author_site": "Jie Ren, Mingjie Li, Meng Zhou, Shih-Han Chan, Quanshi Zhang", "author": "Jie Ren; Mingjie Li; Meng Zhou; Shih-Han Chan; Quanshi Zhang", "abstract": "This paper aims to theoretically analyze the complexity of feature transformations encoded in piecewise linear DNNs with ReLU layers. We propose metrics to measure three types of complexities of transformations based on information theory. 
We further discover and prove the strong correlation between the complexity and the disentanglement of transformations. Based on the proposed metrics, we analyze two typical phenomena of the change of the transformation complexity during the training process, and explore the ceiling of a DNN\u2019s complexity. The proposed metrics can also be used as a loss to learn a DNN with the minimum complexity, which also controls the over-fitting level of the DNN and influences adversarial robustness, adversarial transferability, and knowledge consistency. Comprehensive comparative studies have provided new perspectives to understand the DNN. The code is released at https://github.com/sjtu-XAI-lab/transformation-complexity.", "bibtex": "@InProceedings{pmlr-v162-ren22b,\n title = \t {Towards Theoretical Analysis of Transformation Complexity of {R}e{LU} {DNN}s},\n author = {Ren, Jie and Li, Mingjie and Zhou, Meng and Chan, Shih-Han and Zhang, Quanshi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18537--18558},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ren22b/ren22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/ren22b.html},\n abstract = \t {This paper aims to theoretically analyze the complexity of feature transformations encoded in piecewise linear DNNs with ReLU layers. We propose metrics to measure three types of complexities of transformations based on information theory. We further discover and prove the strong correlation between the complexity and the disentanglement of transformations. Based on the proposed metrics, we analyze two typical phenomena of the change of the transformation complexity during the training process, and explore the ceiling of a DNN\u2019s complexity. The proposed metrics can also be used as a loss to learn a DNN with the minimum complexity, which also controls the over-fitting level of the DNN and influences adversarial robustness, adversarial transferability, and knowledge consistency. Comprehensive comparative studies have provided new perspectives to understand the DNN. 
The code is released at https://github.com/sjtu-XAI-lab/transformation-complexity.}\n}", "pdf": "https://proceedings.mlr.press/v162/ren22b/ren22b.pdf", "supp": "", "pdf_size": 1660033, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1146425504680188001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Shanghai Jiao Tong University; Shanghai Jiao Tong University; Carnegie Mellon University; University of California San Diego; Department of Computer Science and Engineering, the John Hopcroft Center, and the MoE Key Lab of Artificial Intelligence, AI Institute, at Shanghai Jiao Tong University", "aff_domain": "sjtu.edu.cn; ; ; ;sjtu.edu.cn", "email": "sjtu.edu.cn; ; ; ;sjtu.edu.cn", "github": "https://github.com/sjtu-XAI-lab/transformation-complexity", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/ren22b.html", "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "Shanghai Jiao Tong University;Carnegie Mellon University;University of California, San Diego", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.cmu.edu;https://ucsd.edu", "aff_unique_abbr": "SJTU;CMU;UCSD", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "China;United States" }, { "title": "Towards Understanding Sharpness-Aware Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18251", "id": "18251", "proceeding": "https://proceedings.mlr.press/v162/andriushchenko22a.html", "poster": "/media/PosterPDFs/ICML%202022/8c66bb19847dd8c21413c5c8c9d68306_lgrVQ3Q.png?t=1657899337.3643992", "slides": "", "author_site": "Maksym Andriushchenko, Nicolas Flammarion", "author": "Maksym Andriushchenko; Nicolas Flammarion", "abstract": "Sharpness-Aware Minimization (SAM) is a recent training method that relies on worst-case weight perturbations which significantly improves generalization in various settings. We argue that the existing justifications for the success of SAM which are based on a PAC-Bayes generalization bound and the idea of convergence to flat minima are incomplete. Moreover, there are no explanations for the success of using m-sharpness in SAM which has been shown as essential for generalization. To better understand this aspect of SAM, we theoretically analyze its implicit bias for diagonal linear networks. We prove that SAM always chooses a solution that enjoys better generalization properties than standard gradient descent for a certain class of problems, and this effect is amplified by using m-sharpness. We further study the properties of the implicit bias on non-linear networks empirically, where we show that fine-tuning a standard model with SAM can lead to significant generalization improvements. Finally, we provide convergence results of SAM for non-convex objectives when used with stochastic gradients. We illustrate these results empirically for deep networks and discuss their relation to the generalization behavior of SAM. 
The code of our experiments is available at https://github.com/tml-epfl/understanding-sam.", "bibtex": "@InProceedings{pmlr-v162-andriushchenko22a,\n title = \t {Towards Understanding Sharpness-Aware Minimization},\n author = {Andriushchenko, Maksym and Flammarion, Nicolas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {639--668},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/andriushchenko22a/andriushchenko22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/andriushchenko22a.html},\n abstract = \t {Sharpness-Aware Minimization (SAM) is a recent training method that relies on worst-case weight perturbations which significantly improves generalization in various settings. We argue that the existing justifications for the success of SAM which are based on a PAC-Bayes generalization bound and the idea of convergence to flat minima are incomplete. Moreover, there are no explanations for the success of using m-sharpness in SAM which has been shown as essential for generalization. To better understand this aspect of SAM, we theoretically analyze its implicit bias for diagonal linear networks. We prove that SAM always chooses a solution that enjoys better generalization properties than standard gradient descent for a certain class of problems, and this effect is amplified by using m-sharpness. We further study the properties of the implicit bias on non-linear networks empirically, where we show that fine-tuning a standard model with SAM can lead to significant generalization improvements. Finally, we provide convergence results of SAM for non-convex objectives when used with stochastic gradients. We illustrate these results empirically for deep networks and discuss their relation to the generalization behavior of SAM. 
The code of our experiments is available at https://github.com/tml-epfl/understanding-sam.}\n}", "pdf": "https://proceedings.mlr.press/v162/andriushchenko22a/andriushchenko22a.pdf", "supp": "", "pdf_size": 1385881, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18222527206389875127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "EPFL, Switzerland; EPFL, Switzerland", "aff_domain": "epfl.ch; ", "email": "epfl.ch; ", "github": "https://github.com/tml-epfl/understanding-sam", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/andriushchenko22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Towards Uniformly Superhuman Autonomy via Subdominance Minimization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16905", "id": "16905", "proceeding": "https://proceedings.mlr.press/v162/ziebart22a.html", "poster": "/media/PosterPDFs/ICML%202022/b056eb1587586b71e2da9acfe4fbd19e_OwFFX7w.png?t=1657733484.8093364", "slides": "", "author_site": "Brian Ziebart, Sanjiban Choudhury, Xinyan Yan, Paul Vernaza", "author": "Brian Ziebart; Sanjiban Choudhury; Xinyan Yan; Paul Vernaza", "abstract": "Prevalent imitation learning methods seek to produce behavior that matches or exceeds average human performance. This often prevents achieving expert-level or superhuman performance when identifying the better demonstrations to imitate is difficult. We instead assume demonstrations are of varying quality and seek to induce behavior that is unambiguously better (i.e., Pareto dominant or minimally subdominant) than all human demonstrations. Our minimum subdominance inverse optimal control training objective is primarily defined by high quality demonstrations; lower quality demonstrations, which are more easily dominated, are effectively ignored instead of degrading imitation. With increasing probability, our approach produces superhuman behavior incurring lower cost than demonstrations on the demonstrator\u2019s unknown cost function{\u2014}even if that cost function differs for each demonstration. We apply our approach on a computer cursor pointing task, producing behavior that is 78% superhuman, while minimizing demonstration suboptimality provides 50% superhuman behavior{\u2014}and only 72% even after selective data cleaning.", "bibtex": "@InProceedings{pmlr-v162-ziebart22a,\n title = \t {Towards Uniformly Superhuman Autonomy via Subdominance Minimization},\n author = {Ziebart, Brian and Choudhury, Sanjiban and Yan, Xinyan and Vernaza, Paul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27654--27670},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ziebart22a/ziebart22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ziebart22a.html},\n abstract = \t {Prevalent imitation learning methods seek to produce behavior that matches or exceeds average human performance. 
This often prevents achieving expert-level or superhuman performance when identifying the better demonstrations to imitate is difficult. We instead assume demonstrations are of varying quality and seek to induce behavior that is unambiguously better (i.e., Pareto dominant or minimally subdominant) than all human demonstrations. Our minimum subdominance inverse optimal control training objective is primarily defined by high quality demonstrations; lower quality demonstrations, which are more easily dominated, are effectively ignored instead of degrading imitation. With increasing probability, our approach produces superhuman behavior incurring lower cost than demonstrations on the demonstrator\u2019s unknown cost function{\u2014}even if that cost function differs for each demonstration. We apply our approach on a computer cursor pointing task, producing behavior that is 78% superhuman, while minimizing demonstration suboptimality provides 50% superhuman behavior{\u2014}and only 72% even after selective data cleaning.}\n}", "pdf": "https://proceedings.mlr.press/v162/ziebart22a/ziebart22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/ziebart22a-supp.zip", "pdf_size": 1219730, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2340577340317935953&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Computer Science, University of Illinois Chicago; Aurora Innovation; Aurora Innovation; Aurora Innovation", "aff_domain": "uic.edu; ; ; ", "email": "uic.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ziebart22a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Illinois Chicago;Aurora Innovation", "aff_unique_dep": "Computer Science;", "aff_unique_url": "https://www.uic.edu;https://aurora.tech", "aff_unique_abbr": "UIC;Aurora", "aff_campus_unique_index": "0", "aff_campus_unique": "Chicago;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Towards understanding how momentum improves generalization in deep learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16111", "id": "16111", "proceeding": "https://proceedings.mlr.press/v162/jelassi22a.html", "poster": "/media/PosterPDFs/ICML%202022/70efba66d3d8d53194fb1a8446ae07fa.png?t=1657677192.8182702", "slides": "", "author_site": "Samy Jelassi, Yuanzhi Li", "author": "Samy Jelassi; Yuanzhi Li", "abstract": "Stochastic gradient descent (SGD) with momentum is widely used for training modern deep learning architectures. While it is well-understood that using momentum can lead to faster convergence rate in various settings, it has also been observed that momentum yields higher generalization. Prior work argues that momentum stabilizes the SGD noise during training and this leads to higher generalization. In this paper, we adopt another perspective and first empirically show that gradient descent with momentum (GD+M) significantly improves generalization compared to gradient descent (GD) in some deep learning problems. From this observation, we formally study how momentum improves generalization. We devise a binary classification setting where a one-hidden layer (over-parameterized) convolutional neural network trained with GD+M provably generalizes better than the same network trained with GD, when both algorithms are similarly initialized. 
The key insight in our analysis is that momentum is beneficial in datasets where the examples share some feature but differ in their margin. Contrary to GD that memorizes the small margin data, GD+M still learns the feature in these data thanks to its historical gradients. Lastly, we empirically validate our theoretical findings.", "bibtex": "@InProceedings{pmlr-v162-jelassi22a,\n title = \t {Towards understanding how momentum improves generalization in deep learning},\n author = {Jelassi, Samy and Li, Yuanzhi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9965--10040},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jelassi22a/jelassi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jelassi22a.html},\n abstract = \t {Stochastic gradient descent (SGD) with momentum is widely used for training modern deep learning architectures. While it is well-understood that using momentum can lead to faster convergence rate in various settings, it has also been observed that momentum yields higher generalization. Prior work argues that momentum stabilizes the SGD noise during training and this leads to higher generalization. In this paper, we adopt another perspective and first empirically show that gradient descent with momentum (GD+M) significantly improves generalization compared to gradient descent (GD) in some deep learning problems. From this observation, we formally study how momentum improves generalization. We devise a binary classification setting where a one-hidden layer (over-parameterized) convolutional neural network trained with GD+M provably generalizes better than the same network trained with GD, when both algorithms are similarly initialized. The key insight in our analysis is that momentum is beneficial in datasets where the examples share some feature but differ in their margin. Contrary to GD that memorizes the small margin data, GD+M still learns the feature in these data thanks to its historical gradients. 
Lastly, we empirically validate our theoretical findings.}\n}", "pdf": "https://proceedings.mlr.press/v162/jelassi22a/jelassi22a.pdf", "supp": "", "pdf_size": 2508701, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3413448426995846490&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Princeton University, NJ, USA; Carnegie Mellon University, PA, USA", "aff_domain": "princeton.edu; ", "email": "princeton.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/jelassi22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Princeton University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;CMU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Princeton;Pittsburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Tractable Dendritic RNNs for Reconstructing Nonlinear Dynamical Systems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17351", "id": "17351", "proceeding": "https://proceedings.mlr.press/v162/brenner22a.html", "poster": "/media/PosterPDFs/ICML%202022/01846ae470651e97d2f73fce979406a9.png?t=1657527717.8630378", "slides": "", "author_site": "Manuel Brenner, Florian Hess, Jonas M Mikhaeil, Leonard Bereska, Zahra Monfared, Po-Chen Kuo, Daniel Durstewitz", "author": "Manuel Brenner; Florian Hess; Jonas M Mikhaeil; Leonard F Bereska; Zahra Monfared; Po-Chen Kuo; Daniel Durstewitz", "abstract": "In many scientific disciplines, we are interested in inferring the nonlinear dynamical system underlying a set of observed time series, a challenging task in the face of chaotic behavior and noise. Previous deep learning approaches toward this goal often suffered from a lack of interpretability and tractability. In particular, the high-dimensional latent spaces often required for a faithful embedding, even when the underlying dynamics lives on a lower-dimensional manifold, can hamper theoretical analysis. Motivated by the emerging principles of dendritic computation, we augment a dynamically interpretable and mathematically tractable piecewise-linear (PL) recurrent neural network (RNN) by a linear spline basis expansion. We show that this approach retains all the theoretically appealing properties of the simple PLRNN, yet boosts its capacity for approximating arbitrary nonlinear dynamical systems in comparatively low dimensions. We employ two frameworks for training the system, one combining BPTT with teacher forcing, and another based on fast and scalable variational inference. 
We show that the dendritically expanded PLRNN achieves better reconstructions with fewer parameters and dimensions on various dynamical systems benchmarks and compares favorably to other methods, while retaining a tractable and interpretable structure.", "bibtex": "@InProceedings{pmlr-v162-brenner22a,\n title = \t {Tractable Dendritic {RNN}s for Reconstructing Nonlinear Dynamical Systems},\n author = {Brenner, Manuel and Hess, Florian and Mikhaeil, Jonas M and Bereska, Leonard F and Monfared, Zahra and Kuo, Po-Chen and Durstewitz, Daniel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2292--2320},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/brenner22a/brenner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/brenner22a.html},\n abstract = \t {In many scientific disciplines, we are interested in inferring the nonlinear dynamical system underlying a set of observed time series, a challenging task in the face of chaotic behavior and noise. Previous deep learning approaches toward this goal often suffered from a lack of interpretability and tractability. In particular, the high-dimensional latent spaces often required for a faithful embedding, even when the underlying dynamics lives on a lower-dimensional manifold, can hamper theoretical analysis. Motivated by the emerging principles of dendritic computation, we augment a dynamically interpretable and mathematically tractable piecewise-linear (PL) recurrent neural network (RNN) by a linear spline basis expansion. We show that this approach retains all the theoretically appealing properties of the simple PLRNN, yet boosts its capacity for approximating arbitrary nonlinear dynamical systems in comparatively low dimensions. We employ two frameworks for training the system, one combining BPTT with teacher forcing, and another based on fast and scalable variational inference. We show that the dendritically expanded PLRNN achieves better reconstructions with fewer parameters and dimensions on various dynamical systems benchmarks and compares favorably to other methods, while retaining a tractable and interpretable structure.}\n}", "pdf": "https://proceedings.mlr.press/v162/brenner22a/brenner22a.pdf", "supp": "", "pdf_size": 3315832, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8212489607836330678&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/brenner22a.html" }, { "title": "Tractable Uncertainty for Structure Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18117", "id": "18117", "proceeding": "https://proceedings.mlr.press/v162/wang22ad.html", "poster": "/media/PosterPDFs/ICML%202022/0cb82dbdcda47e2ad7b7aaf69573906e.png?t=1657542500.3488443", "slides": "", "author_site": "Benjie Wang, Matthew Wicker, Marta Kwiatkowska", "author": "Benjie Wang; Matthew R Wicker; Marta Kwiatkowska", "abstract": "Bayesian structure learning allows one to capture uncertainty over the causal directed acyclic graph (DAG) responsible for generating given data. 
In this work, we present Tractable Uncertainty for STructure learning (TRUST), a framework for approximate posterior inference that relies on probabilistic circuits as a representation of our posterior belief. In contrast to sample-based posterior approximations, our representation can capture a much richer space of DAGs, while being able to tractably answer a range of useful inference queries. We empirically demonstrate how probabilistic circuits can be used as an augmented representation for structure learning methods, leading to improvement in both the quality of inferred structures and posterior uncertainty. Experimental results also demonstrate the improved representational capacity of TRUST, outperforming competing methods on conditional query answering.", "bibtex": "@InProceedings{pmlr-v162-wang22ad,\n title = \t {Tractable Uncertainty for Structure Learning},\n author = {Wang, Benjie and Wicker, Matthew R and Kwiatkowska, Marta},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23131--23150},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ad/wang22ad.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ad.html},\n abstract = \t {Bayesian structure learning allows one to capture uncertainty over the causal directed acyclic graph (DAG) responsible for generating given data. In this work, we present Tractable Uncertainty for STructure learning (TRUST), a framework for approximate posterior inference that relies on probabilistic circuits as a representation of our posterior belief. In contrast to sample-based posterior approximations, our representation can capture a much richer space of DAGs, while being able to tractably answer a range of useful inference queries. We empirically demonstrate how probabilistic circuits can be used as an augmented representation for structure learning methods, leading to improvement in both the quality of inferred structures and posterior uncertainty. 
Experimental results also demonstrate the improved representational capacity of TRUST, outperforming competing methods on conditional query answering.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ad/wang22ad.pdf", "supp": "", "pdf_size": 1133543, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13676355292878487568&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford; Department of Computer Science, University of Oxford", "aff_domain": "cs.ox.ac.uk; ; ", "email": "cs.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22ad.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Training Characteristic Functions with Reinforcement Learning: XAI-methods play Connect Four", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17941", "id": "17941", "proceeding": "https://proceedings.mlr.press/v162/waldchen22a.html", "poster": "/media/PosterPDFs/ICML%202022/e8855b3528cb03d1def9803220bd3cb9.png?t=1657490485.0897646", "slides": "", "author_site": "Stephan W\u00e4ldchen, Sebastian Pokutta, Felix Huber", "author": "Stephan W\u00e4ldchen; Sebastian Pokutta; Felix Huber", "abstract": "Characteristic functions (from cooperative game theory) are able to evaluate partial inputs and form the basis for attribution methods like Shapley values. These attribution methods allow us to measure how important each input component is for the function output\u2014one of the goals of explainable AI (XAI). Given a standard classifier function, it is unclear how partial input should be realised. Instead, most XAI-methods for black-box classifiers like neural networks consider counterfactual inputs that generally lie off-manifold, which makes them hard to evaluate and easy to manipulate. We propose a setup to directly train characteristic functions in the form of neural networks to play simple two-player games. We apply this to the game of Connect Four by randomly hiding colour information from our agents during training. 
This has three advantages for comparing XAI-methods: It alleviates the ambiguity about how to realise partial input, makes off-manifold evaluation unnecessary and allows us to compare the methods by letting them play against each other.", "bibtex": "@InProceedings{pmlr-v162-waldchen22a,\n title = \t {Training Characteristic Functions with Reinforcement Learning: {XAI}-methods play Connect Four},\n author = {W{\\\"a}ldchen, Stephan and Pokutta, Sebastian and Huber, Felix},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22457--22474},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/waldchen22a/waldchen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/waldchen22a.html},\n abstract = \t {Characteristic functions (from cooperative game theory) are able to evaluate partial inputs and form the basis for attribution methods like Shapley values. These attribution methods allow us to measure how important each input component is for the function output\u2014one of the goals of explainable AI (XAI). Given a standard classifier function, it is unclear how partial input should be realised. Instead, most XAI-methods for black-box classifiers like neural networks consider counterfactual inputs that generally lie off-manifold, which makes them hard to evaluate and easy to manipulate. We propose a setup to directly train characteristic functions in the form of neural networks to play simple two-player games. We apply this to the game of Connect Four by randomly hiding colour information from our agents during training. This has three advantages for comparing XAI-methods: It alleviates the ambiguity about how to realise partial input, makes off-manifold evaluation unnecessary and allows us to compare the methods by letting them play against each other.}\n}", "pdf": "https://proceedings.mlr.press/v162/waldchen22a/waldchen22a.pdf", "supp": "", "pdf_size": 974335, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7098175279247767006&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "TU Berlin & Zuse Institut Berlin; TU Berlin & Zuse Institut Berlin; TU Berlin & Zuse Institut Berlin", "aff_domain": "zib.de; ; ", "email": "zib.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/waldchen22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Berlin", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-berlin.de", "aff_unique_abbr": "TU Berlin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berlin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Training Discrete Deep Generative Models via Gapped Straight-Through Estimator", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16825", "id": "16825", "proceeding": "https://proceedings.mlr.press/v162/fan22a.html", "poster": "/media/PosterPDFs/ICML%202022/b3967a0e938dc2a6340e258630febd5a.png?t=1657238505.5391977", "slides": "", "author_site": "Ting-Han Fan, Ta-Chung Chi, Alexander Rudnicky, Peter Ramadge", "author": "Ting-Han Fan; Ta-Chung Chi; Alexander I. 
Rudnicky; Peter J Ramadge", "abstract": "While deep generative models have succeeded in image processing, natural language processing, and reinforcement learning, training that involves discrete random variables remains challenging due to the high variance of its gradient estimation process. Monte Carlo is a common solution used in most variance reduction approaches. However, this involves time-consuming resampling and multiple function evaluations. We propose a Gapped Straight-Through (GST) estimator to reduce the variance without incurring resampling overhead. This estimator is inspired by the essential properties of Straight-Through Gumbel-Softmax. We determine these properties and show via an ablation study that they are essential. Experiments demonstrate that the proposed GST estimator enjoys better performance compared to strong baselines on two discrete deep generative modeling tasks, MNIST-VAE and ListOps.", "bibtex": "@InProceedings{pmlr-v162-fan22a,\n title = \t {Training Discrete Deep Generative Models via Gapped Straight-Through Estimator},\n author = {Fan, Ting-Han and Chi, Ta-Chung and Rudnicky, Alexander I. and Ramadge, Peter J},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6059--6073},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fan22a/fan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fan22a.html},\n abstract = \t {While deep generative models have succeeded in image processing, natural language processing, and reinforcement learning, training that involves discrete random variables remains challenging due to the high variance of its gradient estimation process. Monte Carlo is a common solution used in most variance reduction approaches. However, this involves time-consuming resampling and multiple function evaluations. We propose a Gapped Straight-Through (GST) estimator to reduce the variance without incurring resampling overhead. This estimator is inspired by the essential properties of Straight-Through Gumbel-Softmax. We determine these properties and show via an ablation study that they are essential. 
Experiments demonstrate that the proposed GST estimator enjoys better performance compared to strong baselines on two discrete deep generative modeling tasks, MNIST-VAE and ListOps.}\n}", "pdf": "https://proceedings.mlr.press/v162/fan22a/fan22a.pdf", "supp": "", "pdf_size": 1341350, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3212785124198988357&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, Princeton University; Language Technologies Institute, Carnegie Mellon University; Language Technologies Institute, Carnegie Mellon University; Department of Electrical and Computer Engineering, Princeton University", "aff_domain": "princeton.edu;andrew.cmu.edu; ; ", "email": "princeton.edu;andrew.cmu.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/fan22a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Princeton University;Carnegie Mellon University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Language Technologies Institute", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;CMU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Training OOD Detectors in their Natural Habitats", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16529", "id": "16529", "proceeding": "https://proceedings.mlr.press/v162/katz-samuels22a.html", "poster": "/media/PosterPDFs/ICML%202022/2b323d6eb28422cef49b266557dd31ad.png?t=1658158197.447113", "slides": "", "author_site": "Julian Katz-Samuels, Julia Nakhleh, Robert Nowak, Yixuan Li", "author": "Julian Katz-Samuels; Julia B Nakhleh; Robert Nowak; Yixuan Li", "abstract": "Out-of-distribution (OOD) detection is important for machine learning models deployed in the wild. Recent methods use auxiliary outlier data to regularize the model for improved OOD detection. However, these approaches make a strong distributional assumption that the auxiliary outlier data is completely separable from the in-distribution (ID) data. In this paper, we propose a novel framework that leverages wild mixture data\u2014that naturally consists of both ID and OOD samples. Such wild data is abundant and arises freely upon deploying a machine learning classifier in their natural habitats. Our key idea is to formulate a constrained optimization problem and to show how to tractably solve it. Our learning objective maximizes the OOD detection rate, subject to constraints on the classification error of ID data and on the OOD error rate of ID examples. We extensively evaluate our approach on common OOD detection tasks and demonstrate superior performance. 
Code is available at https://github.com/jkatzsam/woods_ood.", "bibtex": "@InProceedings{pmlr-v162-katz-samuels22a,\n title = \t {Training {OOD} Detectors in their Natural Habitats},\n author = {Katz-Samuels, Julian and Nakhleh, Julia B and Nowak, Robert and Li, Yixuan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10848--10865},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/katz-samuels22a/katz-samuels22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/katz-samuels22a.html},\n abstract = \t {Out-of-distribution (OOD) detection is important for machine learning models deployed in the wild. Recent methods use auxiliary outlier data to regularize the model for improved OOD detection. However, these approaches make a strong distributional assumption that the auxiliary outlier data is completely separable from the in-distribution (ID) data. In this paper, we propose a novel framework that leverages wild mixture data\u2014that naturally consists of both ID and OOD samples. Such wild data is abundant and arises freely upon deploying a machine learning classifier in their natural habitats. Our key idea is to formulate a constrained optimization problem and to show how to tractably solve it. Our learning objective maximizes the OOD detection rate, subject to constraints on the classification error of ID data and on the OOD error rate of ID examples. We extensively evaluate our approach on common OOD detection tasks and demonstrate superior performance. 
Code is available at https://github.com/jkatzsam/woods_ood.}\n}", "pdf": "https://proceedings.mlr.press/v162/katz-samuels22a/katz-samuels22a.pdf", "supp": "", "pdf_size": 1001790, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8582043463264170613&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Institute for Foundations of Data Science, University of Wisconsin, Madison; Department of Computer Sciences, University of Wisconsin, Madison; Department of Electrical and Computer Engineering, University of Wisconsin, Madison; Department of Computer Sciences, University of Wisconsin, Madison", "aff_domain": "wisc.edu; ; ; ", "email": "wisc.edu; ; ; ", "github": "https://github.com/jkatzsam/woods_ood", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/katz-samuels22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Institute for Foundations of Data Science", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Training Your Sparse Neural Network Better with Any Mask", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16915", "id": "16915", "proceeding": "https://proceedings.mlr.press/v162/jaiswal22a.html", "poster": "/media/PosterPDFs/ICML%202022/0af787945872196b42c9f73ead2565c8.png?t=1657492585.822197", "slides": "", "author_site": "Ajay Jaiswal, Haoyu Ma, Tianlong Chen, Ying Ding, Zhangyang \u201cAtlas\u201d Wang", "author": "Ajay Kumar Jaiswal; Haoyu Ma; Tianlong Chen; Ying Ding; Zhangyang Wang", "abstract": "Pruning large neural networks to create high-quality, independently trainable sparse masks, which can maintain similar performance to their dense counterparts, is very desirable due to the reduced space and time complexity. As research effort is focused on increasingly sophisticated pruning methods that lead to sparse subnetworks trainable from scratch, we argue for an orthogonal, under-explored theme: improving training techniques for pruned sub-networks, i.e. sparse training. Apart from the popular belief that only the quality of sparse masks matters for sparse training, in this paper we demonstrate an alternative opportunity: one can carefully customize the sparse training techniques to deviate from the default dense network training protocols, consisting of introducing \u201cghost\" neurons and skip connections at the early stage of training, and strategically modifying the initialization as well as labels. Our new sparse training recipe is generally applicable to improving training from scratch with various sparse masks. By adopting our newly curated techniques, we demonstrate significant performance gains across various popular datasets (CIFAR-10, CIFAR-100, TinyImageNet), architectures (ResNet-18/32/104, Vgg16, MobileNet), and sparse mask options (lottery ticket, SNIP/GRASP, SynFlow, or even randomly pruning), compared to the default training protocols, especially at high sparsity levels. 
Codes will be publicly available.", "bibtex": "@InProceedings{pmlr-v162-jaiswal22a,\n title = \t {Training Your Sparse Neural Network Better with Any Mask},\n author = {Jaiswal, Ajay Kumar and Ma, Haoyu and Chen, Tianlong and Ding, Ying and Wang, Zhangyang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9833--9844},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jaiswal22a/jaiswal22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/jaiswal22a.html},\n abstract = \t {Pruning large neural networks to create high-quality, independently trainable sparse masks, which can maintain similar performance to their dense counterparts, is very desirable due to the reduced space and time complexity. As research effort is focused on increasingly sophisticated pruning methods that lead to sparse subnetworks trainable from scratch, we argue for an orthogonal, under-explored theme: improving training techniques for pruned sub-networks, i.e. sparse training. Apart from the popular belief that only the quality of sparse masks matters for sparse training, in this paper we demonstrate an alternative opportunity: one can carefully customize the sparse training techniques to deviate from the default dense network training protocols, consisting of introducing \u201cghost\" neurons and skip connections at the early stage of training, and strategically modifying the initialization as well as labels. Our new sparse training recipe is generally applicable to improving training from scratch with various sparse masks. By adopting our newly curated techniques, we demonstrate significant performance gains across various popular datasets (CIFAR-10, CIFAR-100, TinyImageNet), architectures (ResNet-18/32/104, Vgg16, MobileNet), and sparse mask options (lottery ticket, SNIP/GRASP, SynFlow, or even randomly pruning), compared to the default training protocols, especially at high sparsity levels. 
Codes will be publicly available.}\n}", "pdf": "https://proceedings.mlr.press/v162/jaiswal22a/jaiswal22a.pdf", "supp": "", "pdf_size": 824024, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17434761620518064417&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "The University of Texas at Austin; University of California, Irvine; The University of Texas at Austin; The University of Texas at Austin; The University of Texas at Austin", "aff_domain": "utexas.edu; ; ; ;utexas.edu", "email": "utexas.edu; ; ; ;utexas.edu", "github": "https://github.com/VITA-Group/ToST", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/jaiswal22a.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Texas at Austin;University of California, Irvine", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.uci.edu", "aff_unique_abbr": "UT Austin;UCI", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Austin;Irvine", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18181", "id": "18181", "proceeding": "https://proceedings.mlr.press/v162/notin22a.html", "poster": "/media/PosterPDFs/ICML%202022/97ffcbd95363387c7e371563057eb02f.png?t=1657209228.9074683", "slides": "/media/icml-2022/Slides/18181.pdf", "author_site": "Pascal Notin, Mafalda Dias, Jonathan Frazer, Javier Marchena-Hurtado, Aidan Gomez, Debora Marks, Yarin Gal", "author": "Pascal Notin; Mafalda Dias; Jonathan Frazer; Javier Marchena-Hurtado; Aidan N Gomez; Debora Marks; Yarin Gal", "abstract": "The ability to accurately model the fitness landscape of protein sequences is critical to a wide range of applications, from quantifying the effects of human variants on disease likelihood, to predicting immune-escape mutations in viruses and designing novel biotherapeutic proteins. Deep generative models of protein sequences trained on multiple sequence alignments have been the most successful approaches so far to address these tasks. The performance of these methods is however contingent on the availability of sufficiently deep and diverse alignments for reliable training. Their potential scope is thus limited by the fact many protein families are hard, if not impossible, to align. Large language models trained on massive quantities of non-aligned protein sequences from diverse families address these problems and show potential to eventually bridge the performance gap. We introduce Tranception, a novel transformer architecture leveraging autoregressive predictions and retrieval of homologous sequences at inference to achieve state-of-the-art fitness prediction performance. Given its markedly higher performance on multiple mutants, robustness to shallow alignments and ability to score indels, our approach offers significant gain of scope over existing approaches. 
To enable more rigorous model testing across a broader range of protein families, we develop ProteinGym \u2013 an extensive set of multiplexed assays of variant effects, substantially increasing both the number and diversity of assays compared to existing benchmarks.", "bibtex": "@InProceedings{pmlr-v162-notin22a,\n title = \t {Tranception: Protein Fitness Prediction with Autoregressive Transformers and Inference-time Retrieval},\n author = {Notin, Pascal and Dias, Mafalda and Frazer, Jonathan and Marchena-Hurtado, Javier and Gomez, Aidan N and Marks, Debora and Gal, Yarin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16990--17017},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/notin22a/notin22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/notin22a.html},\n abstract = \t {The ability to accurately model the fitness landscape of protein sequences is critical to a wide range of applications, from quantifying the effects of human variants on disease likelihood, to predicting immune-escape mutations in viruses and designing novel biotherapeutic proteins. Deep generative models of protein sequences trained on multiple sequence alignments have been the most successful approaches so far to address these tasks. The performance of these methods is however contingent on the availability of sufficiently deep and diverse alignments for reliable training. Their potential scope is thus limited by the fact many protein families are hard, if not impossible, to align. Large language models trained on massive quantities of non-aligned protein sequences from diverse families address these problems and show potential to eventually bridge the performance gap. We introduce Tranception, a novel transformer architecture leveraging autoregressive predictions and retrieval of homologous sequences at inference to achieve state-of-the-art fitness prediction performance. Given its markedly higher performance on multiple mutants, robustness to shallow alignments and ability to score indels, our approach offers significant gain of scope over existing approaches. 
To enable more rigorous model testing across a broader range of protein families, we develop ProteinGym \u2013 an extensive set of multiplexed assays of variant effects, substantially increasing both the number and diversity of assays compared to existing benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/notin22a/notin22a.pdf", "supp": "", "pdf_size": 938560, "gs_citation": 237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13139855140556717827&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "OATML Group, Department of Computer Science, University of Oxford, Oxford, UK+ Cohere, Toronto, Canada; Marks Group, Department of Systems Biology, Harvard Medical School, Boston, MA, USA; Marks Group, Department of Systems Biology, Harvard Medical School, Boston, MA, USA; Marks Group, Department of Systems Biology, Harvard Medical School, Boston, MA, USA; OATML Group, Department of Computer Science, University of Oxford, Oxford, UK+ Cohere, Toronto, Canada; Marks Group, Department of Systems Biology, Harvard Medical School, Boston, MA, USA; OATML Group, Department of Computer Science, University of Oxford, Oxford, UK", "aff_domain": "cs.ox.ac.uk; ; ; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/notin22a.html", "aff_unique_index": "0+1;2;2;2;0+1;2;0", "aff_unique_norm": "University of Oxford;Cohere;Harvard Medical School", "aff_unique_dep": "Department of Computer Science;;Department of Systems Biology", "aff_unique_url": "https://www.ox.ac.uk;https://cohere.ai;https://hms.harvard.edu", "aff_unique_abbr": "Oxford;;HMS", "aff_campus_unique_index": "0+1;2;2;2;0+1;2;0", "aff_campus_unique": "Oxford;Toronto;Boston", "aff_country_unique_index": "0+1;2;2;2;0+1;2;0", "aff_country_unique": "United Kingdom;Canada;United States" }, { "title": "Transfer Learning In Differential Privacy\u2019s Hybrid-Model", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16175", "id": "16175", "proceeding": "https://proceedings.mlr.press/v162/kohen22a.html", "poster": "/media/PosterPDFs/ICML%202022/a19744e268754fb0148b017647355b7b.png?t=1657173488.5894494", "slides": "", "author_site": "Refael Kohen, Or Sheffet", "author": "Refael Kohen; Or Sheffet", "abstract": "The", "bibtex": "@InProceedings{pmlr-v162-kohen22a,\n title = \t {Transfer Learning In Differential Privacy\u2019s Hybrid-Model},\n author = {Kohen, Refael and Sheffet, Or},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11413--11429},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kohen22a/kohen22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/kohen22a.html},\n abstract = \t {The", "pdf": "https://proceedings.mlr.press/v162/kohen22a/kohen22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/kohen22a-supp.zip", "pdf_size": 1776179, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17331234590146527214&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Faculty of Engineering, Bar-Ilan University, Israel; Faculty of Engineering, Bar-Ilan University, Israel", "aff_domain": "gmail.com;biu.ac.il", "email": 
"gmail.com;biu.ac.il", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/kohen22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Bar-Ilan University", "aff_unique_dep": "Faculty of Engineering", "aff_unique_url": "https://www.biu.ac.il", "aff_unique_abbr": "BIU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Transfer and Marginalize: Explaining Away Label Noise with Privileged Information", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17369", "id": "17369", "proceeding": "https://proceedings.mlr.press/v162/collier22a.html", "poster": "/media/PosterPDFs/ICML%202022/6c524f9d5d7027454a783c841250ba71.png?t=1658099170.0699549", "slides": "", "author_site": "Mark Collier, Rodolphe Jenatton, Efi Kokiopoulou, Jesse Berent", "author": "Mark Collier; Rodolphe Jenatton; Effrosyni Kokiopoulou; Jesse Berent", "abstract": "Supervised learning datasets often have privileged information, in the form of features which are available at training time but are not available at test time e.g. the ID of the annotator that provided the label. We argue that privileged information is useful for explaining away label noise, thereby reducing the harmful impact of noisy labels. We develop a simple and efficient method for supervised learning with neural networks: it transfers via weight sharing the knowledge learned with privileged information and approximately marginalizes over privileged information at test time. Our method, TRAM (TRansfer and Marginalize), has minimal training time overhead and has the same test-time cost as not using privileged information. TRAM performs strongly on CIFAR-10H, ImageNet and Civil Comments benchmarks.", "bibtex": "@InProceedings{pmlr-v162-collier22a,\n title = \t {Transfer and Marginalize: Explaining Away Label Noise with Privileged Information},\n author = {Collier, Mark and Jenatton, Rodolphe and Kokiopoulou, Effrosyni and Berent, Jesse},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4219--4237},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/collier22a/collier22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/collier22a.html},\n abstract = \t {Supervised learning datasets often have privileged information, in the form of features which are available at training time but are not available at test time e.g. the ID of the annotator that provided the label. We argue that privileged information is useful for explaining away label noise, thereby reducing the harmful impact of noisy labels. We develop a simple and efficient method for supervised learning with neural networks: it transfers via weight sharing the knowledge learned with privileged information and approximately marginalizes over privileged information at test time. Our method, TRAM (TRansfer and Marginalize), has minimal training time overhead and has the same test-time cost as not using privileged information. 
TRAM performs strongly on CIFAR-10H, ImageNet and Civil Comments benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/collier22a/collier22a.pdf", "supp": "", "pdf_size": 2036337, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11032101970683787492&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google AI; Google AI; Google AI; Google AI", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/collier22a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google AI", "aff_unique_url": "https://ai.google", "aff_unique_abbr": "Google AI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Transformer Neural Processes: Uncertainty-Aware Meta Learning Via Sequence Modeling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17523", "id": "17523", "proceeding": "https://proceedings.mlr.press/v162/nguyen22b.html", "poster": "/media/PosterPDFs/ICML%202022/bf8dd8c68d02e161c28dc9ea139d4784_8KSxYQh.png?t=1657861353.4086485", "slides": "", "author_site": "Tung Nguyen, Aditya Grover", "author": "Tung Nguyen; Aditya Grover", "abstract": "Neural Processes (NPs) are a popular class of approaches for meta-learning. Similar to Gaussian Processes (GPs), NPs define distributions over functions and can estimate uncertainty in their predictions. However, unlike GPs, NPs and their variants suffer from underfitting and often have intractable likelihoods, which limit their applications in sequential decision making. We propose Transformer Neural Processes (TNPs), a new member of the NP family that casts uncertainty-aware meta learning as a sequence modeling problem. We learn TNPs via an autoregressive likelihood-based objective and instantiate it with a novel transformer-based architecture that respects the inductive biases inherent to the problem structure, such as invariance to the observed data points and equivariance to the unobserved points. We further design knobs within the TNP architecture to tradeoff the increase in expressivity of the decoding distribution with extra computation. Empirically, we show that TNPs achieve state-of-the-art performance on various benchmark problems, outperforming all previous NP variants on meta regression, image completion, contextual multi-armed bandits, and Bayesian optimization.", "bibtex": "@InProceedings{pmlr-v162-nguyen22b,\n title = \t {Transformer Neural Processes: Uncertainty-Aware Meta Learning Via Sequence Modeling},\n author = {Nguyen, Tung and Grover, Aditya},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16569--16594},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nguyen22b/nguyen22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/nguyen22b.html},\n abstract = \t {Neural Processes (NPs) are a popular class of approaches for meta-learning. Similar to Gaussian Processes (GPs), NPs define distributions over functions and can estimate uncertainty in their predictions. 
However, unlike GPs, NPs and their variants suffer from underfitting and often have intractable likelihoods, which limit their applications in sequential decision making. We propose Transformer Neural Processes (TNPs), a new member of the NP family that casts uncertainty-aware meta learning as a sequence modeling problem. We learn TNPs via an autoregressive likelihood-based objective and instantiate it with a novel transformer-based architecture that respects the inductive biases inherent to the problem structure, such as invariance to the observed data points and equivariance to the unobserved points. We further design knobs within the TNP architecture to tradeoff the increase in expressivity of the decoding distribution with extra computation. Empirically, we show that TNPs achieve state-of-the-art performance on various benchmark problems, outperforming all previous NP variants on meta regression, image completion, contextual multi-armed bandits, and Bayesian optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/nguyen22b/nguyen22b.pdf", "supp": "", "pdf_size": 17653800, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8314226561470238527&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, UCLA; Department of Computer Science, UCLA", "aff_domain": "cs.ucla.edu; ", "email": "cs.ucla.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/nguyen22b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Transformer Quality in Linear Time", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17273", "id": "17273", "proceeding": "https://proceedings.mlr.press/v162/hua22a.html", "poster": "/media/PosterPDFs/ICML%202022/1f33d7cf6693dc6dcc7029b97cc29487_QLkxxP0.png?t=1657852589.0622714", "slides": "", "author_site": "Weizhe Hua, Zihang Dai, Hanxiao Liu, Quoc Le", "author": "Weizhe Hua; Zihang Dai; Hanxiao Liu; Quoc Le", "abstract": "We revisit the design choices in Transformers, and propose methods to address their weaknesses in handling long sequences. First, we propose a simple layer named gated attention unit, which allows the use of a weaker single-head attention with minimal quality loss. We then propose a linear approximation method complementary to this new layer, which is accelerator-friendly and highly competitive in quality. 
The resulting model, named FLASH, matches the perplexity of improved Transformers over both short (512) and long (8K) context lengths, achieving training speedups of up to 4.9x on Wiki-40B and 12.1x on PG-19 for auto-regressive language modeling, and 4.8x on C4 for masked language modeling.", "bibtex": "@InProceedings{pmlr-v162-hua22a,\n title = \t {Transformer Quality in Linear Time},\n author = {Hua, Weizhe and Dai, Zihang and Liu, Hanxiao and Le, Quoc},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9099--9117},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hua22a/hua22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hua22a.html},\n abstract = \t {We revisit the design choices in Transformers, and propose methods to address their weaknesses in handling long sequences. First, we propose a simple layer named gated attention unit, which allows the use of a weaker single-head attention with minimal quality loss. We then propose a linear approximation method complementary to this new layer, which is accelerator-friendly and highly competitive in quality. The resulting model, named FLASH, matches the perplexity of improved Transformers over both short (512) and long (8K) context lengths, achieving training speedups of up to 4.9x on Wiki-40B and 12.1x on PG-19 for auto-regressive language modeling, and 4.8x on C4 for masked language modeling.}\n}", "pdf": "https://proceedings.mlr.press/v162/hua22a/hua22a.pdf", "supp": "", "pdf_size": 725921, "gs_citation": 327, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9557741307862181596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Cornell University+Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "cornell.edu;google.com;google.com; ", "email": "cornell.edu;google.com;google.com; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hua22a.html", "aff_unique_index": "0+1;1;1;1", "aff_unique_norm": "Cornell University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.cornell.edu;https://research.google", "aff_unique_abbr": "Cornell;Google", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Transformers are Meta-Reinforcement Learners", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17085", "id": "17085", "proceeding": "https://proceedings.mlr.press/v162/melo22a.html", "poster": "/media/PosterPDFs/ICML%202022/3d0236a18121ec86eff612e3b26bf7b9.png?t=1657838212.4403875", "slides": "", "author_site": "Luckeciano Melo", "author": "Luckeciano C Melo", "abstract": "The transformer architecture and variants presented a remarkable success across many machine learning tasks in recent years. This success is intrinsically related to the capability of handling long sequences and the presence of context-dependent weights from the attention mechanism. We argue that these capabilities suit the central role of a Meta-Reinforcement Learning algorithm. 
Indeed, a meta-RL agent needs to infer the task from a sequence of trajectories. Furthermore, it requires a fast adaptation strategy to adapt its policy for a new task - which can be achieved using the self-attention mechanism. In this work, we present TrMRL (Transformers for Meta-Reinforcement Learning), a meta-RL agent that mimics the memory reinstatement mechanism using the transformer architecture. It associates the recent past of working memories to build an episodic memory recursively through the transformer layers. We show that the self-attention computes a consensus representation that minimizes the Bayes Risk at each layer and provides meaningful features to compute the best actions. We conducted experiments in high-dimensional continuous control environments for locomotion and dexterous manipulation. Results show that TrMRL presents comparable or superior asymptotic performance, sample efficiency, and out-of-distribution generalization compared to the baselines in these environments.", "bibtex": "@InProceedings{pmlr-v162-melo22a,\n title = \t {Transformers are Meta-Reinforcement Learners},\n author = {Melo, Luckeciano C},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15340--15359},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/melo22a/melo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/melo22a.html},\n abstract = \t {The transformer architecture and variants presented a remarkable success across many machine learning tasks in recent years. This success is intrinsically related to the capability of handling long sequences and the presence of context-dependent weights from the attention mechanism. We argue that these capabilities suit the central role of a Meta-Reinforcement Learning algorithm. Indeed, a meta-RL agent needs to infer the task from a sequence of trajectories. Furthermore, it requires a fast adaptation strategy to adapt its policy for a new task - which can be achieved using the self-attention mechanism. In this work, we present TrMRL (Transformers for Meta-Reinforcement Learning), a meta-RL agent that mimics the memory reinstatement mechanism using the transformer architecture. It associates the recent past of working memories to build an episodic memory recursively through the transformer layers. We show that the self-attention computes a consensus representation that minimizes the Bayes Risk at each layer and provides meaningful features to compute the best actions. We conducted experiments in high-dimensional continuous control environments for locomotion and dexterous manipulation. 
Results show that TrMRL presents comparable or superior asymptotic performance, sample efficiency, and out-of-distribution generalization compared to the baselines in these environments.}\n}", "pdf": "https://proceedings.mlr.press/v162/melo22a/melo22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/melo22a-supp.zip", "pdf_size": 1992436, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4334650228414799916&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Microsoft, USA+Center of Excellence in Artificial Intelligence (Deep Learning Brazil), Brazil", "aff_domain": "gmail.com", "email": "gmail.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v162/melo22a.html", "aff_unique_index": "0+1", "aff_unique_norm": "Microsoft;Center of Excellence in Artificial Intelligence", "aff_unique_dep": "Microsoft Corporation;Deep Learning Brazil", "aff_unique_url": "https://www.microsoft.com;", "aff_unique_abbr": "Microsoft;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1", "aff_country_unique": "United States;Brazil" }, { "title": "Translating Robot Skills: Learning Unsupervised Skill Correspondences Across Robots", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16261", "id": "16261", "proceeding": "https://proceedings.mlr.press/v162/shankar22a.html", "poster": "/media/PosterPDFs/ICML%202022/c92a10324374fac681719d63979d00fe.png?t=1657740527.4788315", "slides": "", "author_site": "Tanmay Shankar, Yixin Lin, Aravind Rajeswaran, Vikash Kumar, Stuart Anderson, Jean Oh", "author": "Tanmay Shankar; Yixin Lin; Aravind Rajeswaran; Vikash Kumar; Stuart Anderson; Jean Oh", "abstract": "In this paper, we explore how we can endow robots with the ability to learn correspondences between their own skills, and those of morphologically different robots in different domains, in an entirely unsupervised manner. We make the insight that different morphological robots use similar task strategies to solve similar tasks. Based on this insight, we frame learning skill correspondences as a problem of matching distributions of sequences of skills across robots. We then present an unsupervised objective that encourages a learnt skill translation model to match these distributions across domains, inspired by recent advances in unsupervised machine translation. Our approach is able to learn semantically meaningful correspondences between skills across multiple robot-robot and human-robot domain pairs despite being completely unsupervised. Further, the learnt correspondences enable the transfer of task strategies across robots and domains. 
We present dynamic visualizations of our results at https://sites.google.com/view/translatingrobotskills/home.", "bibtex": "@InProceedings{pmlr-v162-shankar22a,\n title = \t {Translating Robot Skills: Learning Unsupervised Skill Correspondences Across Robots},\n author = {Shankar, Tanmay and Lin, Yixin and Rajeswaran, Aravind and Kumar, Vikash and Anderson, Stuart and Oh, Jean},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19626--19644},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shankar22a/shankar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shankar22a.html},\n abstract = \t {In this paper, we explore how we can endow robots with the ability to learn correspondences between their own skills, and those of morphologically different robots in different domains, in an entirely unsupervised manner. We make the insight that different morphological robots use similar task strategies to solve similar tasks. Based on this insight, we frame learning skill correspondences as a problem of matching distributions of sequences of skills across robots. We then present an unsupervised objective that encourages a learnt skill translation model to match these distributions across domains, inspired by recent advances in unsupervised machine translation. Our approach is able to learn semantically meaningful correspondences between skills across multiple robot-robot and human-robot domain pairs despite being completely unsupervised. Further, the learnt correspondences enable the transfer of task strategies across robots and domains. 
We present dynamic visualizations of our results at https://sites.google.com/view/translatingrobotskills/home.}\n}", "pdf": "https://proceedings.mlr.press/v162/shankar22a/shankar22a.pdf", "supp": "", "pdf_size": 5767225, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17491515858263634565&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Robotics Institute, Carnegie Mellon University, Pittsburgh, PA, USA+Meta AI Research, Pittsburgh, PA, USA; Meta AI Research, Pittsburgh, PA, USA; Meta AI Research, Pittsburgh, PA, USA; Meta AI Research, Pittsburgh, PA, USA; Meta AI Research, Pittsburgh, PA, USA; Robotics Institute, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "", "project": "https://sites.google.com/view/translatingrobotskills/home", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/shankar22a.html", "aff_unique_index": "0+1;1;1;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": "Robotics Institute;Meta AI Research", "aff_unique_url": "https://www.cmu.edu;", "aff_unique_abbr": "CMU;", "aff_campus_unique_index": "0+0;0;0;0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0+0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Translatotron 2: High-quality direct speech-to-speech translation with voice preservation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16401", "id": "16401", "proceeding": "https://proceedings.mlr.press/v162/jia22b.html", "poster": "/media/PosterPDFs/ICML%202022/97af4fb322bb5c8973ade16764156bed.png?t=1657839251.107762", "slides": "/media/icml-2022/Slides/16401.pdf", "author_site": "Ye Jia, Michelle Tadmor Ramanovich, Tal Remez, Roi Pomerantz", "author": "Ye Jia; Michelle Tadmor Ramanovich; Tal Remez; Roi Pomerantz", "abstract": "We present Translatotron 2, a neural direct speech-to-speech translation model that can be trained end-to-end. Translatotron 2 consists of a speech encoder, a linguistic decoder, an acoustic synthesizer, and a single attention module that connects them together. Experimental results on three datasets consistently show that Translatotron 2 outperforms the original Translatotron by a large margin on both translation quality (up to +15.5 BLEU) and speech generation quality, and approaches the same of cascade systems. In addition, we propose a simple method for preserving speakers\u2019 voices from the source speech to the translation speech in a different language. Unlike existing approaches, the proposed method is able to preserve each speaker\u2019s voice on speaker turns without requiring for speaker segmentation. 
Furthermore, compared to existing approaches, it better preserves speaker\u2019s privacy and mitigates potential misuse of voice cloning for creating spoofing audio artifacts.", "bibtex": "@InProceedings{pmlr-v162-jia22b,\n title = \t {Translatotron 2: High-quality direct speech-to-speech translation with voice preservation},\n author = {Jia, Ye and Ramanovich, Michelle Tadmor and Remez, Tal and Pomerantz, Roi},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {10120--10134},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/jia22b/jia22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/jia22b.html},\n abstract = \t {We present Translatotron 2, a neural direct speech-to-speech translation model that can be trained end-to-end. Translatotron 2 consists of a speech encoder, a linguistic decoder, an acoustic synthesizer, and a single attention module that connects them together. Experimental results on three datasets consistently show that Translatotron 2 outperforms the original Translatotron by a large margin on both translation quality (up to +15.5 BLEU) and speech generation quality, and approaches the same of cascade systems. In addition, we propose a simple method for preserving speakers\u2019 voices from the source speech to the translation speech in a different language. Unlike existing approaches, the proposed method is able to preserve each speaker\u2019s voice on speaker turns without requiring for speaker segmentation. Furthermore, compared to existing approaches, it better preserves speaker\u2019s privacy and mitigates potential misuse of voice cloning for creating spoofing audio artifacts.}\n}", "pdf": "https://proceedings.mlr.press/v162/jia22b/jia22b.pdf", "supp": "", "pdf_size": 1027765, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12151245660410247961&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Google Research; Google Research; Google Research; Google Research", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/jia22b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "UAST: Uncertainty-Aware Siamese Tracking", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17463", "id": "17463", "proceeding": "https://proceedings.mlr.press/v162/zhang22g.html", "poster": "/media/PosterPDFs/ICML%202022/a554f89dd61cabd2ff833d3468e2008a.png?t=1657264051.0049815", "slides": "", "author_site": "Dawei Zhang, Yanwei Fu, Zhonglong Zheng", "author": "Dawei Zhang; Yanwei Fu; Zhonglong Zheng", "abstract": "Visual object tracking is basically formulated as target classification and bounding box estimation. Recent anchor-free Siamese trackers rely on predicting the distances to four sides for efficient regression but fail to estimate accurate bounding box in complex scenes. 
We argue that these approaches lack a clear probabilistic explanation, so it is desirable to model the uncertainty and ambiguity representation of target estimation. To address this issue, this paper presents an Uncertainty-Aware Siamese Tracker (UAST) by developing a novel distribution-based regression formulation with localization uncertainty. We exploit regression vectors to directly represent the discretized probability distribution for four offsets of boxes, which is general, flexible and informative. Based on the resulting distributed representation, our method is able to provide a probabilistic value of uncertainty. Furthermore, considering the high correlation between the uncertainty and regression accuracy, we propose to learn a joint representation head of classification and localization quality for reliable tracking, which also avoids the inconsistency of classification and quality estimation between training and inference. Extensive experiments on several challenging tracking benchmarks demonstrate the effectiveness of UAST and its superiority over other Siamese trackers.", "bibtex": "@InProceedings{pmlr-v162-zhang22g,\n title = \t {{UAST}: Uncertainty-Aware Siamese Tracking},\n author = {Zhang, Dawei and Fu, Yanwei and Zheng, Zhonglong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26161--26175},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22g/zhang22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22g.html},\n abstract = \t {Visual object tracking is basically formulated as target classification and bounding box estimation. Recent anchor-free Siamese trackers rely on predicting the distances to four sides for efficient regression but fail to estimate accurate bounding box in complex scenes. We argue that these approaches lack a clear probabilistic explanation, so it is desirable to model the uncertainty and ambiguity representation of target estimation. To address this issue, this paper presents an Uncertainty-Aware Siamese Tracker (UAST) by developing a novel distribution-based regression formulation with localization uncertainty. We exploit regression vectors to directly represent the discretized probability distribution for four offsets of boxes, which is general, flexible and informative. Based on the resulting distributed representation, our method is able to provide a probabilistic value of uncertainty. Furthermore, considering the high correlation between the uncertainty and regression accuracy, we propose to learn a joint representation head of classification and localization quality for reliable tracking, which also avoids the inconsistency of classification and quality estimation between training and inference. 
Extensive experiments on several challenging tracking benchmarks demonstrate the effectiveness of UAST and its superiority over other Siamese trackers.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22g/zhang22g.pdf", "supp": "", "pdf_size": 3964136, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14143444198041030969&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "College of Mathematics and Computer Science, Zhejiang Normal University, Jinhua, China; School of Data Science, Fudan University, Shanghai, China; Key Laboratory of Intelligent Education Technology and Application of Zhejiang Province, Zhejiang Normal University, Jinhua, China", "aff_domain": "zjnu.edu.cn;fudan.edu.cn;zjnu.edu.cn", "email": "zjnu.edu.cn;fudan.edu.cn;zjnu.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/zhang22g.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Zhejiang Normal University;Fudan University", "aff_unique_dep": "College of Mathematics and Computer Science;School of Data Science", "aff_unique_url": "http://www.zjnu.edu.cn;https://www.fudan.edu.cn", "aff_unique_abbr": "ZJNU;Fudan", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Jinhua;Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "UNIREX: A Unified Learning Framework for Language Model Rationale Extraction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17777", "id": "17777", "proceeding": "https://proceedings.mlr.press/v162/chan22a.html", "poster": "/media/PosterPDFs/ICML%202022/ebad33b3c9fa1d10327bb55f9e79e2f3.png?t=1657931689.7264936", "slides": "", "author_site": "Aaron Chan, Maziar Sanjabi, Lambert Mathias, Liang Tan, Shaoliang Nie, Xiaochang Peng, Xiang Ren, Hamed Firooz", "author": "Aaron Chan; Maziar Sanjabi; Lambert Mathias; Liang Tan; Shaoliang Nie; Xiaochang Peng; Xiang Ren; Hamed Firooz", "abstract": "An extractive rationale explains a language model\u2019s (LM\u2019s) prediction on a given task instance by highlighting the text inputs that most influenced the prediction. Ideally, rationale extraction should be faithful (reflective of LM\u2019s actual behavior) and plausible (convincing to humans), without compromising the LM\u2019s (i.e., task model\u2019s) task performance. Although attribution algorithms and select-predict pipelines are commonly used in rationale extraction, they both rely on certain heuristics that hinder them from satisfying all three desiderata. In light of this, we propose UNIREX, a flexible learning framework which generalizes rationale extractor optimization as follows: (1) specify architecture for a learned rationale extractor; (2) select explainability objectives (i.e., faithfulness and plausibility criteria); and (3) jointly train the task model and rationale extractor on the task using selected objectives. UNIREX enables replacing prior works\u2019 heuristic design choices with a generic learned rationale extractor in (1) and optimizing it for all three desiderata in (2)-(3). To facilitate comparison between methods w.r.t. multiple desiderata, we introduce the Normalized Relative Gain (NRG) metric. On five English text classification datasets, our best UNIREX configuration outperforms baselines by an average of 32.9% NRG. 
Plus, UNIREX rationale extractors\u2019 faithfulness can even generalize to unseen datasets and tasks.", "bibtex": "@InProceedings{pmlr-v162-chan22a,\n title = \t {{UNIREX}: A Unified Learning Framework for Language Model Rationale Extraction},\n author = {Chan, Aaron and Sanjabi, Maziar and Mathias, Lambert and Tan, Liang and Nie, Shaoliang and Peng, Xiaochang and Ren, Xiang and Firooz, Hamed},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2867--2889},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chan22a/chan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/chan22a.html},\n abstract = \t {An extractive rationale explains a language model\u2019s (LM\u2019s) prediction on a given task instance by highlighting the text inputs that most influenced the prediction. Ideally, rationale extraction should be faithful (reflective of LM\u2019s actual behavior) and plausible (convincing to humans), without compromising the LM\u2019s (i.e., task model\u2019s) task performance. Although attribution algorithms and select-predict pipelines are commonly used in rationale extraction, they both rely on certain heuristics that hinder them from satisfying all three desiderata. In light of this, we propose UNIREX, a flexible learning framework which generalizes rationale extractor optimization as follows: (1) specify architecture for a learned rationale extractor; (2) select explainability objectives (i.e., faithfulness and plausibility criteria); and (3) jointly train the task model and rationale extractor on the task using selected objectives. UNIREX enables replacing prior works\u2019 heuristic design choices with a generic learned rationale extractor in (1) and optimizing it for all three desiderata in (2)-(3). To facilitate comparison between methods w.r.t. multiple desiderata, we introduce the Normalized Relative Gain (NRG) metric. On five English text classification datasets, our best UNIREX configuration outperforms baselines by an average of 32.9% NRG. 
Plus, UNIREX rationale extractors\u2019 faithfulness can even generalize to unseen datasets and tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/chan22a/chan22a.pdf", "supp": "", "pdf_size": 1158166, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7352055260763393065&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of Southern California; Meta AI; Meta AI; Meta AI; Meta AI; Meta AI; University of Southern California; Meta AI", "aff_domain": "usc.edu; ; ; ; ; ; ; ", "email": "usc.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/chan22a.html", "aff_unique_index": "0;1;1;1;1;1;0;1", "aff_unique_norm": "University of Southern California;Meta", "aff_unique_dep": ";Meta AI", "aff_unique_url": "https://www.usc.edu;https://meta.com", "aff_unique_abbr": "USC;Meta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unaligned Supervision for Automatic Music Transcription in The Wild", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17847", "id": "17847", "proceeding": "https://proceedings.mlr.press/v162/maman22a.html", "poster": "/media/PosterPDFs/ICML%202022/0f96613235062963ccde717b18f97592_uL2WThG.png?t=1656686591.6707377", "slides": "/media/icml-2022/Slides/17847.pdf", "author_site": "Ben Maman, Amit Bermano", "author": "Ben Maman; Amit H Bermano", "abstract": "Multi-instrument Automatic Music Transcription (AMT), or the decoding of a musical recording into semantic musical content, is one of the holy grails of Music Information Retrieval. Current AMT approaches are restricted to piano and (some) guitar recordings, due to difficult data collection. In order to overcome data collection barriers, previous AMT approaches attempt to employ musical scores in the form of a digitized version of the same song or piece. The scores are typically aligned using audio features and strenuous human intervention to generate training labels. We introduce Note$_{EM}$, a method for simultaneously training a transcriber and aligning the scores to their corresponding performances, in a fully-automated process. Using this unaligned supervision scheme, complemented by pseudo-labels and pitch shift augmentation, our method enables training on in-the-wild recordings with unprecedented accuracy and instrumental variety. Using only synthetic data and unaligned supervision, we report SOTA note-level accuracy of the MAPS dataset, and large favorable margins on cross-dataset evaluations. We also demonstrate robustness and ease of use; we report comparable results when training on a small, easily obtainable, self-collected dataset, and we propose alternative labeling to the MusicNet dataset, which we show to be more accurate. 
Our project page is available at https://benadar293.github.io.", "bibtex": "@InProceedings{pmlr-v162-maman22a,\n title = \t {Unaligned Supervision for Automatic Music Transcription in The Wild},\n author = {Maman, Ben and Bermano, Amit H},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14918--14934},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/maman22a/maman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/maman22a.html},\n abstract = \t {Multi-instrument Automatic Music Transcription (AMT), or the decoding of a musical recording into semantic musical content, is one of the holy grails of Music Information Retrieval. Current AMT approaches are restricted to piano and (some) guitar recordings, due to difficult data collection. In order to overcome data collection barriers, previous AMT approaches attempt to employ musical scores in the form of a digitized version of the same song or piece. The scores are typically aligned using audio features and strenuous human intervention to generate training labels. We introduce Note$_{EM}$, a method for simultaneously training a transcriber and aligning the scores to their corresponding performances, in a fully-automated process. Using this unaligned supervision scheme, complemented by pseudo-labels and pitch shift augmentation, our method enables training on in-the-wild recordings with unprecedented accuracy and instrumental variety. Using only synthetic data and unaligned supervision, we report SOTA note-level accuracy of the MAPS dataset, and large favorable margins on cross-dataset evaluations. We also demonstrate robustness and ease of use; we report comparable results when training on a small, easily obtainable, self-collected dataset, and we propose alternative labeling to the MusicNet dataset, which we show to be more accurate. 
Our project page is available at https://benadar293.github.io.}\n}", "pdf": "https://proceedings.mlr.press/v162/maman22a/maman22a.pdf", "supp": "", "pdf_size": 718625, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7612759621426730574&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Computer Science, Tel Aviv University, Israel; School of Computer Science, Tel Aviv University, Israel", "aff_domain": "mail.tau.ac.il;tauex.tau.ac.il", "email": "mail.tau.ac.il;tauex.tau.ac.il", "github": "", "project": "https://benadar293.github.io", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/maman22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tel Aviv", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Uncertainty Modeling in Generative Compressed Sensing", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16577", "id": "16577", "proceeding": "https://proceedings.mlr.press/v162/zhang22ai.html", "poster": "/media/PosterPDFs/ICML%202022/347665597cbfaef834886adbb848011f_bt0ebTa.png?t=1656350477.2560499", "slides": "/media/icml-2022/Slides/16577.pdf", "author_site": "Yilang Zhang, Mengchu Xu, Xiaojun Mao, Jian Wang", "author": "Yilang Zhang; Mengchu Xu; Xiaojun Mao; Jian Wang", "abstract": "Compressed sensing (CS) aims to recover a high-dimensional signal with structural priors from its low-dimensional linear measurements. Inspired by the huge success of deep neural networks in modeling the priors of natural signals, generative neural networks have been recently used to replace the hand-crafted structural priors in CS. However, the reconstruction capability of the generative model is fundamentally limited by the range of its generator, typically a small subset of the signal space of interest. To break this bottleneck and thus reconstruct those out-of-range signals, this paper presents a novel method called CS-BGM that can effectively expands the range of generator. Specifically, CS-BGM introduces uncertainties to the latent variable and parameters of the generator, while adopting the variational inference (VI) and maximum a posteriori (MAP) to infer them. Theoretical analysis demonstrates that expanding the range of generators is necessary for reducing the reconstruction error in generative CS. Extensive experiments show a consistent improvement of CS-BGM over the baselines.", "bibtex": "@InProceedings{pmlr-v162-zhang22ai,\n title = \t {Uncertainty Modeling in Generative Compressed Sensing},\n author = {Zhang, Yilang and Xu, Mengchu and Mao, Xiaojun and Wang, Jian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26655--26668},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22ai/zhang22ai.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22ai.html},\n abstract = \t {Compressed sensing (CS) aims to recover a high-dimensional signal with structural priors from its low-dimensional linear measurements. 
Inspired by the huge success of deep neural networks in modeling the priors of natural signals, generative neural networks have been recently used to replace the hand-crafted structural priors in CS. However, the reconstruction capability of the generative model is fundamentally limited by the range of its generator, typically a small subset of the signal space of interest. To break this bottleneck and thus reconstruct those out-of-range signals, this paper presents a novel method called CS-BGM that can effectively expands the range of generator. Specifically, CS-BGM introduces uncertainties to the latent variable and parameters of the generator, while adopting the variational inference (VI) and maximum a posteriori (MAP) to infer them. Theoretical analysis demonstrates that expanding the range of generators is necessary for reducing the reconstruction error in generative CS. Extensive experiments show a consistent improvement of CS-BGM over the baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22ai/zhang22ai.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/zhang22ai-supp.zip", "pdf_size": 4529503, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5279387903738965754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Data Science, Fudan University, Shanghai, China; School of Data Science, Fudan University, Shanghai, China; School of Mathematical Sciences, Shanghai Jiao Tong University, Shanghai, China; School of Data Science, Fudan University, Shanghai, China", "aff_domain": "fudan.edu.cn;fudan.edu.cn;sjtu.edu.cn;fudan.edu.cn", "email": "fudan.edu.cn;fudan.edu.cn;sjtu.edu.cn;fudan.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22ai.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Fudan University;Shanghai Jiao Tong University", "aff_unique_dep": "School of Data Science;School of Mathematical Sciences", "aff_unique_url": "https://www.fudan.edu.cn;https://www.sjtu.edu.cn", "aff_unique_abbr": "Fudan;SJTU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "UnderGrad: A Universal Black-Box Optimization Method with Almost Dimension-Free Convergence Rate Guarantees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17903", "id": "17903", "proceeding": "https://proceedings.mlr.press/v162/antonakopoulos22b.html", "poster": "/media/PosterPDFs/ICML%202022/4cef5b5e6ff1b3445db4c013f1d452e0_SZxXhMt.png?t=1659135101.2896457", "slides": "", "author_site": "Kimon Antonakopoulos, Dong Quan Vu, Volkan Cevher, Kfir Levy, Panayotis Mertikopoulos", "author": "Kimon Antonakopoulos; Dong Quan Vu; Volkan Cevher; Kfir Levy; Panayotis Mertikopoulos", "abstract": "Universal methods achieve optimal convergence rate guarantees in convex optimization without any prior knowledge of the problem\u2019s regularity parameters or the attributes of the gradient oracle employed by the method. In this regard, existing state-of-the-art algorithms achieve an $O(1/T^2)$ convergence rate in Lipschitz smooth problems with a perfect gradient oracle, and an $O(1/\\sqrt{T})$ convergence speed when the underlying problem is non-smooth and/or the gradient oracle is stochastic. 
On the downside, these methods do not take into account the dependence of these guarantees on the problem\u2019s dimensionality, and this can have a catastrophic impact on a method\u2019s convergence, in both theory and practice. Our paper aims to bridge this gap by providing a scalable universal method - dubbed UnDERGrad - which enjoys an almost dimension-free oracle complexity in problems with a favorable geometry (like the simplex, $\\ell_1$-ball or trace-constraints), while retaining the order-optimal dependence on T described above. These \"best of both worlds\" guarantees are achieved via a primal-dual update scheme inspired by the dual exploration method for variational inequalities.", "bibtex": "@InProceedings{pmlr-v162-antonakopoulos22b,\n title = \t {{U}nder{G}rad: A Universal Black-Box Optimization Method with Almost Dimension-Free Convergence Rate Guarantees},\n author = {Antonakopoulos, Kimon and Vu, Dong Quan and Cevher, Volkan and Levy, Kfir and Mertikopoulos, Panayotis},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {772--795},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/antonakopoulos22b/antonakopoulos22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/antonakopoulos22b.html},\n abstract = \t {Universal methods achieve optimal convergence rate guarantees in convex optimization without any prior knowledge of the problem\u2019s regularity parameters or the attributes of the gradient oracle employed by the method. In this regard, existing state-of-the-art algorithms achieve an $O(1/T^2)$ convergence rate in Lipschitz smooth problems with a perfect gradient oracle, and an $O(1/\\sqrt{T})$ convergence speed when the underlying problem is non-smooth and/or the gradient oracle is stochastic. On the downside, these methods do not take into account the dependence of these guarantees on the problem\u2019s dimensionality, and this can have a catastrophic impact on a method\u2019s convergence, in both theory and practice. Our paper aims to bridge this gap by providing a scalable universal method - dubbed UnDERGrad - which enjoys an almost dimension-free oracle complexity in problems with a favorable geometry (like the simplex, $\\ell_1$-ball or trace-constraints), while retaining the order-optimal dependence on T described above. These \"best of both worlds\" guarantees are achieved via a primal-dual update scheme inspired by the dual exploration method for variational inequalities.}\n}", "pdf": "https://proceedings.mlr.press/v162/antonakopoulos22b/antonakopoulos22b.pdf", "supp": "", "pdf_size": 1798355, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7141871102858312959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Laboratory for Information and Inference Systems, IEL STI EPFL, 1015 Lausanne, Switzerland + Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Safran Tech, Magny-Les-Hameaux, France + Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France; Laboratory for Information and Inference Systems, IEL STI EPFL, 1015 Lausanne, Switzerland; Technion, Haifa, Israel + A Viterbi Fellow; Univ. 
Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, 38000 Grenoble, France + Criteo AI Lab", "aff_domain": "epfl.ch;safrangroup.com; ;technion.ac.il;univ-grenoble-alpes.fr", "email": "epfl.ch;safrangroup.com; ;technion.ac.il;univ-grenoble-alpes.fr", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/antonakopoulos22b.html", "aff_unique_index": "0+1;2+1;0;3+4;1+5", "aff_unique_norm": "EPFL;Universite Grenoble Alpes;Safran Tech;Technion - Israel Institute of Technology;University of Southern California;Criteo", "aff_unique_dep": "IEL STI;;;;Viterbi School of Engineering;Criteo AI Lab", "aff_unique_url": "https://www.epfl.ch;https://www.univ-grenoble-alpes.fr;https://www.safrantech.com;https://www.technion.ac.il/en/;https://viterbi.usc.edu;https://www.criteo.com", "aff_unique_abbr": "EPFL;UGA;;Technion;USC;Criteo", "aff_campus_unique_index": "0+1;2+1;0;3;1", "aff_campus_unique": "Lausanne;Grenoble;Magny-Les-Hameaux;Haifa;", "aff_country_unique_index": "0+1;1+1;0;2+3;1+1", "aff_country_unique": "Switzerland;France;Israel;United States" }, { "title": "Understanding Clipping for Federated Learning: Convergence and Client-Level Differential Privacy", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16803", "id": "16803", "proceeding": "https://proceedings.mlr.press/v162/zhang22b.html", "poster": "/media/PosterPDFs/ICML%202022/ca0daec69b5adc880fb464895726dbdf.png?t=1657735524.3513057", "slides": "", "author_site": "xinwei zhang, Xiangyi Chen, Mingyi Hong, Steven Wu, Jinfeng Yi", "author": "Xinwei Zhang; Xiangyi Chen; Mingyi Hong; Steven Wu; Jinfeng Yi", "abstract": "Providing privacy protection has been one of the primary motivations of Federated Learning (FL). Recently, there has been a line of work on incorporating the formal privacy notion of differential privacy with FL. To guarantee the client-level differential privacy in FL algorithms, the clients\u2019 transmitted model updates have to be clipped before adding privacy noise. Such clipping operation is substantially different from its counterpart of gradient clipping in the centralized differentially private SGD and has not been well-understood. In this paper, we first empirically demonstrate that the clipped FedAvg can perform surprisingly well even with substantial data heterogeneity when training neural networks, which is partly because the clients\u2019 updates become similar for several popular deep architectures. Based on this key observation, we provide the convergence analysis of a differential private (DP) FedAvg algorithm and highlight the relationship between clipping bias and the distribution of the clients\u2019 updates. 
To the best of our knowledge, this is the first work that rigorously investigates theoretical and empirical issues regarding the clipping operation in FL algorithms.", "bibtex": "@InProceedings{pmlr-v162-zhang22b,\n title = \t {Understanding Clipping for Federated Learning: Convergence and Client-Level Differential Privacy},\n author = {Zhang, Xinwei and Chen, Xiangyi and Hong, Mingyi and Wu, Steven and Yi, Jinfeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26048--26067},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22b/zhang22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22b.html},\n abstract = \t {Providing privacy protection has been one of the primary motivations of Federated Learning (FL). Recently, there has been a line of work on incorporating the formal privacy notion of differential privacy with FL. To guarantee the client-level differential privacy in FL algorithms, the clients\u2019 transmitted model updates have to be clipped before adding privacy noise. Such clipping operation is substantially different from its counterpart of gradient clipping in the centralized differentially private SGD and has not been well-understood. In this paper, we first empirically demonstrate that the clipped FedAvg can perform surprisingly well even with substantial data heterogeneity when training neural networks, which is partly because the clients\u2019 updates become similar for several popular deep architectures. Based on this key observation, we provide the convergence analysis of a differential private (DP) FedAvg algorithm and highlight the relationship between clipping bias and the distribution of the clients\u2019 updates. 
To the best of our knowledge, this is the first work that rigorously investigates theoretical and empirical issues regarding the clipping operation in FL algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhang22b/zhang22b.pdf", "supp": "", "pdf_size": 36033989, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4917010174884492209&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Electrical and Computer Engineering, University of Minnesota, MN, United States; Department of Electrical and Computer Engineering, University of Minnesota, MN, United States; Department of Electrical and Computer Engineering, University of Minnesota, MN, United States; School of Computer Science, Carnegie Mellon University, PA, United States; JD.com, Inc., Shanghai, China", "aff_domain": "umn.edu; ;umn.edu; ; ", "email": "umn.edu; ;umn.edu; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhang22b.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Minnesota;Carnegie Mellon University;JD.com, Inc.", "aff_unique_dep": "Department of Electrical and Computer Engineering;School of Computer Science;", "aff_unique_url": "https://www.umn.edu;https://www.cmu.edu;https://www.jd.com", "aff_unique_abbr": "UMN;CMU;JD.com", "aff_campus_unique_index": "0;0;0;1;2", "aff_campus_unique": "Minneapolis;Pittsburgh;Shanghai", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;China" }, { "title": "Understanding Contrastive Learning Requires Incorporating Inductive Biases", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18173", "id": "18173", "proceeding": "https://proceedings.mlr.press/v162/saunshi22a.html", "poster": "/media/PosterPDFs/ICML%202022/0e1bacf07b14673fcdb553da51b999a5_Qmtui0M.png?t=1657836066.4913473", "slides": "", "author_site": "Nikunj Umesh Saunshi, Jordan Ash, Surbhi Goel, Dipendra Kumar Misra, Cyril Zhang, Sanjeev Arora, Sham Kakade, Akshay Krishnamurthy", "author": "Nikunj Saunshi; Jordan Ash; Surbhi Goel; Dipendra Misra; Cyril Zhang; Sanjeev Arora; Sham Kakade; Akshay Krishnamurthy", "abstract": "Contrastive learning is a popular form of self-supervised learning that encourages augmentations (views) of the same input to have more similar representations compared to augmentations of different inputs. 
Recent attempts to theoretically explain the success of contrastive learning on downstream classification tasks prove guarantees depending on properties of", "bibtex": "@InProceedings{pmlr-v162-saunshi22a,\n title = \t {Understanding Contrastive Learning Requires Incorporating Inductive Biases},\n author = {Saunshi, Nikunj and Ash, Jordan and Goel, Surbhi and Misra, Dipendra and Zhang, Cyril and Arora, Sanjeev and Kakade, Sham and Krishnamurthy, Akshay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19250--19286},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saunshi22a/saunshi22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/saunshi22a.html},\n abstract = \t {Contrastive learning is a popular form of self-supervised learning that encourages augmentations (views) of the same input to have more similar representations compared to augmentations of different inputs. Recent attempts to theoretically explain the success of contrastive learning on downstream classification tasks prove guarantees depending on properties of", "pdf": "https://proceedings.mlr.press/v162/saunshi22a/saunshi22a.pdf", "supp": "", "pdf_size": 2225290, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8429647646548426240&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Princeton University; Microsoft Research, New York City; Microsoft Research, New York City; Microsoft Research, New York City; Microsoft Research, New York City; Department of Computer Science, Princeton University; Departments of Computer Science & Statistics, Harvard University; Microsoft Research, New York City", "aff_domain": "cs.princeton.edu; ; ; ; ; ; ; ", "email": "cs.princeton.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/saunshi22a.html", "aff_unique_index": "0;1;1;1;1;0;2;1", "aff_unique_norm": "Princeton University;Microsoft;Harvard University", "aff_unique_dep": "Department of Computer Science;Microsoft Research;Departments of Computer Science & Statistics", "aff_unique_url": "https://www.princeton.edu;https://www.microsoft.com/en-us/research;https://www.harvard.edu", "aff_unique_abbr": "Princeton;MSR;Harvard", "aff_campus_unique_index": "1;1;1;1;2;1", "aff_campus_unique": ";New York City;Cambridge", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Dataset Difficulty with $\\mathcal{V}$-Usable Information", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16633", "id": "16633", "proceeding": "https://proceedings.mlr.press/v162/ethayarajh22a.html", "poster": "", "slides": "", "author_site": "Kawin Ethayarajh, Yejin Choi, Swabha Swayamdipta", "author": "Kawin Ethayarajh; Yejin Choi; Swabha Swayamdipta", "abstract": "Estimating the difficulty of a dataset typically involves comparing state-of-the-art models to humans; the bigger the performance gap, the harder the dataset is said to be. However, this comparison provides little understanding of how difficult each instance in a given distribution is, or what attributes make the dataset difficult for a given model. 
To address these questions, we frame dataset difficulty\u2014w.r.t. a model $\\mathcal{V}$\u2014as the lack of $\\mathcal{V}$-usable information (Xu et al., 2019), where a lower value indicates a more difficult dataset for $\\mathcal{V}$. We further introduce pointwise $\\mathcal{V}$-information (PVI) for measuring the difficulty of individual instances w.r.t. a given distribution. While standard evaluation metrics typically only compare different models for the same dataset, $\\mathcal{V}$-usable information and PVI also permit the converse: for a given model $\\mathcal{V}$, we can compare different datasets, as well as different instances/slices of the same dataset. Furthermore, our framework allows for the interpretability of different input attributes via transformations of the input, which we use to discover annotation artefacts in widely-used NLP benchmarks.", "bibtex": "@InProceedings{pmlr-v162-ethayarajh22a,\n title = \t {Understanding Dataset Difficulty with $\\mathcal{V}$-Usable Information},\n author = {Ethayarajh, Kawin and Choi, Yejin and Swayamdipta, Swabha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5988--6008},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ethayarajh22a/ethayarajh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ethayarajh22a.html},\n abstract = \t {Estimating the difficulty of a dataset typically involves comparing state-of-the-art models to humans; the bigger the performance gap, the harder the dataset is said to be. However, this comparison provides little understanding of how difficult each instance in a given distribution is, or what attributes make the dataset difficult for a given model. To address these questions, we frame dataset difficulty\u2014w.r.t. a model $\\mathcal{V}$\u2014as the lack of $\\mathcal{V}$-usable information (Xu et al., 2019), where a lower value indicates a more difficult dataset for $\\mathcal{V}$. We further introduce pointwise $\\mathcal{V}$-information (PVI) for measuring the difficulty of individual instances w.r.t. a given distribution. While standard evaluation metrics typically only compare different models for the same dataset, $\\mathcal{V}$-usable information and PVI also permit the converse: for a given model $\\mathcal{V}$, we can compare different datasets, as well as different instances/slices of the same dataset. Furthermore, our framework allows for the interpretability of different input attributes via transformations of the input, which we use to discover annotation artefacts in widely-used NLP benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/ethayarajh22a/ethayarajh22a.pdf", "supp": "", "pdf_size": 711685, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff": "Stanford University; Allen Institute for Artificial Intelligence+Paul G. 
Allen School of Computer Science, University of Washington; Allen Institute for Artificial Intelligence", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ethayarajh22a.html", "aff_unique_index": "0;1+2;1", "aff_unique_norm": "Stanford University;Allen Institute for Artificial Intelligence;University of Washington", "aff_unique_dep": ";;Paul G. Allen School of Computer Science", "aff_unique_url": "https://www.stanford.edu;https://allenai.org;https://www.washington.edu", "aff_unique_abbr": "Stanford;AI2;UW", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Seattle", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "United States" }, { "title": "Understanding Doubly Stochastic Clustering", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17907", "id": "17907", "proceeding": "https://proceedings.mlr.press/v162/ding22a.html", "poster": "/media/PosterPDFs/ICML%202022/14f2ebeab937ca128186e7ba876faef9_a2e5sqa.png?t=1658351840.5438678", "slides": "/media/icml-2022/Slides/17907_3eipupS.pdf", "author_site": "Tianjiao Ding, Derek Lim, Rene Vidal, Benjamin Haeffele", "author": "Tianjiao Ding; Derek Lim; Rene Vidal; Benjamin D Haeffele", "abstract": "The problem of projecting a matrix onto the space of", "bibtex": "@InProceedings{pmlr-v162-ding22a,\n title = \t {Understanding Doubly Stochastic Clustering},\n author = {Ding, Tianjiao and Lim, Derek and Vidal, Rene and Haeffele, Benjamin D},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5153--5165},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ding22a/ding22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ding22a.html},\n abstract = \t {The problem of projecting a matrix onto the space of", "pdf": "https://proceedings.mlr.press/v162/ding22a/ding22a.pdf", "supp": "", "pdf_size": 2022475, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8239059328858100472&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ding22a.html" }, { "title": "Understanding Gradient Descent on the Edge of Stability in Deep Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18395", "id": "18395", "proceeding": "https://proceedings.mlr.press/v162/arora22a.html", "poster": "/media/PosterPDFs/ICML%202022/fecf2c550171d3195c879d115440ae45.png?t=1657822443.180696", "slides": "/media/icml-2022/Slides/18395.pdf", "author_site": "Sanjeev Arora, Zhiyuan Li, Abhishek Panigrahi", "author": "Sanjeev Arora; Zhiyuan Li; Abhishek Panigrahi", "abstract": "Deep learning experiments by \\citet{cohen2021gradient} using deterministic Gradient Descent (GD) revealed an", "bibtex": "@InProceedings{pmlr-v162-arora22a,\n title = \t {Understanding Gradient Descent on the Edge of Stability in Deep Learning},\n author = {Arora, Sanjeev and Li, Zhiyuan and Panigrahi, Abhishek},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {948--1024},\n year = 
\t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/arora22a/arora22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/arora22a.html},\n abstract = \t {Deep learning experiments by \\citet{cohen2021gradient} using deterministic Gradient Descent (GD) revealed an", "pdf": "https://proceedings.mlr.press/v162/arora22a/arora22a.pdf", "supp": "", "pdf_size": 1730867, "gs_citation": 132, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9099753820789474638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Princeton University; Princeton University; Princeton University", "aff_domain": "cs.princeton.edu;cs.princeton.edu;cs.princeton.edu", "email": "cs.princeton.edu;cs.princeton.edu;cs.princeton.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/arora22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Gradual Domain Adaptation: Improved Analysis, Optimal Path and Beyond", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18363", "id": "18363", "proceeding": "https://proceedings.mlr.press/v162/wang22n.html", "poster": "/media/PosterPDFs/ICML%202022/bc573864331a9e42e4511de6f678aa83.png?t=1657902352.852823", "slides": "", "author_site": "Haoxiang Wang, Bo Li, Han Zhao", "author": "Haoxiang Wang; Bo Li; Han Zhao", "abstract": "The vast majority of existing algorithms for unsupervised domain adaptation (UDA) focus on adapting from a labeled source domain to an unlabeled target domain directly in a one-off way. Gradual domain adaptation (GDA), on the other hand, assumes a path of $(T-1)$ unlabeled intermediate domains bridging the source and target, and aims to provide better generalization in the target domain by leveraging the intermediate ones. Under certain assumptions, Kumar et al. (2020) proposed a simple algorithm, Gradual Self-Training, along with a generalization bound in the order of $e^{O(T)} \\left(\\varepsilon_0+O\\left(\\sqrt{log(T)/n}\\right)\\right)$ for the target domain error, where $\\varepsilon_0$ is the source domain error and $n$ is the data size of each domain. Due to the exponential factor, this upper bound becomes vacuous when $T$ is only moderately large. In this work, we analyze gradual self-training under more general and relaxed assumptions, and prove a significantly improved generalization bound as $\\widetilde{O}\\left(\\varepsilon_0 + T\\Delta + T/\\sqrt{n} + 1/\\sqrt{nT}\\right)$, where $\\Delta$ is the average distributional distance between consecutive domains. Compared with the existing bound with an exponential dependency on $T$ as a multiplicative factor, our bound only depends on $T$ linearly and additively. Perhaps more interestingly, our result implies the existence of an optimal choice of $T$ that minimizes the generalization error, and it also naturally suggests an optimal way to construct the path of intermediate domains so as to minimize the accumulative path length $T\\Delta$ between the source and target. 
To corroborate the implications of our theory, we examine gradual self-training on multiple semi-synthetic and real datasets, which confirms our findings. We believe our insights provide a path forward toward the design of future GDA algorithms.", "bibtex": "@InProceedings{pmlr-v162-wang22n,\n title = \t {Understanding Gradual Domain Adaptation: Improved Analysis, Optimal Path and Beyond},\n author = {Wang, Haoxiang and Li, Bo and Zhao, Han},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22784--22801},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22n/wang22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22n.html},\n abstract = \t {The vast majority of existing algorithms for unsupervised domain adaptation (UDA) focus on adapting from a labeled source domain to an unlabeled target domain directly in a one-off way. Gradual domain adaptation (GDA), on the other hand, assumes a path of $(T-1)$ unlabeled intermediate domains bridging the source and target, and aims to provide better generalization in the target domain by leveraging the intermediate ones. Under certain assumptions, Kumar et al. (2020) proposed a simple algorithm, Gradual Self-Training, along with a generalization bound in the order of $e^{O(T)} \\left(\\varepsilon_0+O\\left(\\sqrt{log(T)/n}\\right)\\right)$ for the target domain error, where $\\varepsilon_0$ is the source domain error and $n$ is the data size of each domain. Due to the exponential factor, this upper bound becomes vacuous when $T$ is only moderately large. In this work, we analyze gradual self-training under more general and relaxed assumptions, and prove a significantly improved generalization bound as $\\widetilde{O}\\left(\\varepsilon_0 + T\\Delta + T/\\sqrt{n} + 1/\\sqrt{nT}\\right)$, where $\\Delta$ is the average distributional distance between consecutive domains. Compared with the existing bound with an exponential dependency on $T$ as a multiplicative factor, our bound only depends on $T$ linearly and additively. Perhaps more interestingly, our result implies the existence of an optimal choice of $T$ that minimizes the generalization error, and it also naturally suggests an optimal way to construct the path of intermediate domains so as to minimize the accumulative path length $T\\Delta$ between the source and target. To corroborate the implications of our theory, we examine gradual self-training on multiple semi-synthetic and real datasets, which confirms our findings. 
We believe our insights provide a path forward toward the design of future GDA algorithms.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22n/wang22n.pdf", "supp": "", "pdf_size": 1456606, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8368642919883535588&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign", "aff_domain": "illinois.edu; ; ", "email": "illinois.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22n.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Instance-Level Impact of Fairness Constraints", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17253", "id": "17253", "proceeding": "https://proceedings.mlr.press/v162/wang22ac.html", "poster": "/media/PosterPDFs/ICML%202022/9a3f263a5e5f63006098a05cd7491997.png?t=1658416036.593749", "slides": "", "author_site": "Jialu Wang, Xin Eric Wang, Yang Liu", "author": "Jialu Wang; Xin Eric Wang; Yang Liu", "abstract": "A variety of fairness constraints have been proposed in the literature to mitigate group-level statistical bias. Their impacts have been largely evaluated for different groups of populations corresponding to a set of sensitive attributes, such as race or gender. Nonetheless, the community has not observed sufficient explorations for how imposing fairness constraints fare at an instance level. Building on the concept of influence function, a measure that characterizes the impact of a training example on the target model and its predictive performance, this work studies the influence of training examples when fairness constraints are imposed. We find out that under certain assumptions, the influence function with respect to fairness constraints can be decomposed into a kernelized combination of training examples. One promising application of the proposed fairness influence function is to identify suspicious training examples that may cause model discrimination by ranking their influence scores. We demonstrate with extensive experiments that training on a subset of weighty data examples leads to lower fairness violations with a trade-off of accuracy.", "bibtex": "@InProceedings{pmlr-v162-wang22ac,\n title = \t {Understanding Instance-Level Impact of Fairness Constraints},\n author = {Wang, Jialu and Wang, Xin Eric and Liu, Yang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23114--23130},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ac/wang22ac.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ac.html},\n abstract = \t {A variety of fairness constraints have been proposed in the literature to mitigate group-level statistical bias. 
Their impacts have been largely evaluated for different groups of populations corresponding to a set of sensitive attributes, such as race or gender. Nonetheless, the community has not observed sufficient explorations for how imposing fairness constraints fare at an instance level. Building on the concept of influence function, a measure that characterizes the impact of a training example on the target model and its predictive performance, this work studies the influence of training examples when fairness constraints are imposed. We find out that under certain assumptions, the influence function with respect to fairness constraints can be decomposed into a kernelized combination of training examples. One promising application of the proposed fairness influence function is to identify suspicious training examples that may cause model discrimination by ranking their influence scores. We demonstrate with extensive experiments that training on a subset of weighty data examples leads to lower fairness violations with a trade-off of accuracy.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ac/wang22ac.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22ac-supp.zip", "pdf_size": 612758, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3186856282017277340&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA; Department of Computer Science and Engineering, University of California, Santa Cruz, CA, USA", "aff_domain": "ucsc.edu;ucsc.edu;ucsc.edu", "email": "ucsc.edu;ucsc.edu;ucsc.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22ac.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Santa Cruz", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.ucsc.edu", "aff_unique_abbr": "UCSC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Policy Gradient Algorithms: A Sensitivity-Based Approach", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16059", "id": "16059", "proceeding": "https://proceedings.mlr.press/v162/wu22i.html", "poster": "/media/PosterPDFs/ICML%202022/a532400ed62e772b9dc0b86f46e583ff_FLDpwoj.png?t=1657509163.4735997", "slides": "", "author_site": "Shuang Wu, Ling Shi, Jun Wang, Guangjian Tian", "author": "Shuang Wu; Ling Shi; Jun Wang; Guangjian Tian", "abstract": "The REINFORCE algorithm \\cite{williams1992simple} is popular in policy gradient (PG) for solving reinforcement learning (RL) problems. Meanwhile, the theoretical form of PG is from\u00a0\\cite{sutton1999policy}. Although both formulae prescribe PG, their precise connections are not yet illustrated. Recently, \\citeauthor{nota2020policy} (\\citeyear{nota2020policy}) have found that the ambiguity causes implementation errors. Motivated by the ambiguity and implementation incorrectness, we study PG from a perturbation perspective. In particular, we derive PG in a unified framework, precisely clarify the relation between PG implementation and theory, and echos back the findings by \\citeauthor{nota2020policy}. 
Diving into factors contributing to empirical successes of the existing erroneous implementations, we find that small approximation error and the experience replay mechanism play critical roles.", "bibtex": "@InProceedings{pmlr-v162-wu22i,\n title = \t {Understanding Policy Gradient Algorithms: A Sensitivity-Based Approach},\n author = {Wu, Shuang and Shi, Ling and Wang, Jun and Tian, Guangjian},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24131--24149},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22i/wu22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22i.html},\n abstract = \t {The REINFORCE algorithm \\cite{williams1992simple} is popular in policy gradient (PG) for solving reinforcement learning (RL) problems. Meanwhile, the theoretical form of PG is from\u00a0\\cite{sutton1999policy}. Although both formulae prescribe PG, their precise connections are not yet illustrated. Recently, \\citeauthor{nota2020policy} (\\citeyear{nota2020policy}) have found that the ambiguity causes implementation errors. Motivated by the ambiguity and implementation incorrectness, we study PG from a perturbation perspective. In particular, we derive PG in a unified framework, precisely clarify the relation between PG implementation and theory, and echos back the findings by \\citeauthor{nota2020policy}. Diving into factors contributing to empirical successes of the existing erroneous implementations, we find that small approximation error and the experience replay mechanism play critical roles.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22i/wu22i.pdf", "supp": "", "pdf_size": 703964, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4406309466358032878&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Huawei Noah\u2019s Ark Lab; Hong Kong University of Science and Technology; University College London; Huawei Noah\u2019s Ark Lab", "aff_domain": "huawei.com; ; ; ", "email": "huawei.com; ; ; ", "github": "", "project": "https://spinningup.openai.com/en/latest/index.html; https://www.mathworks.com/help/reinforcement-learning/agents.html", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/wu22i.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Huawei;Hong Kong University of Science and Technology;University College London", "aff_unique_dep": "Noah\u2019s Ark Lab;;", "aff_unique_url": "https://www.huawei.com;https://www.ust.hk;https://www.ucl.ac.uk", "aff_unique_abbr": "Huawei;HKUST;UCL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "Understanding Robust Generalization in Learning Regular Languages", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16927", "id": "16927", "proceeding": "https://proceedings.mlr.press/v162/dan22a.html", "poster": "/media/PosterPDFs/ICML%202022/54f5f4071faca32ad5285fef87b78646_2kQPEiv.png?t=1657955654.8571477", "slides": "/media/icml-2022/Slides/16927.pdf", "author_site": "Soham Dan, Osbert Bastani, Dan Roth", "author": "Soham Dan; Osbert Bastani; Dan Roth", "abstract": "A key feature of human intelligence is the ability 
to generalize beyond the training distribution, for instance, parsing longer sentences than seen in the past. Currently, deep neural networks struggle to generalize robustly to such shifts in the data distribution. We study robust generalization in the context of using recurrent neural networks (RNNs) to learn regular languages. We hypothesize that standard end-to-end modeling strategies cannot generalize well to systematic distribution shifts and propose a compositional strategy to address this. We compare an end-to-end strategy that maps strings to labels with a compositional strategy that predicts the structure of the deterministic finite state automaton (DFA) that accepts the regular language. We theoretically prove that the compositional strategy generalizes significantly better than the end-to-end strategy. In our experiments, we implement the compositional strategy via an auxiliary task where the goal is to predict the intermediate states visited by the DFA when parsing a string. Our empirical results support our hypothesis, showing that auxiliary tasks can enable robust generalization. Interestingly, the end-to-end RNN generalizes significantly better than the theoretical lower bound, suggesting that it is able to achieve at least some degree of robust generalization.", "bibtex": "@InProceedings{pmlr-v162-dan22a,\n title = \t {Understanding Robust Generalization in Learning Regular Languages},\n author = {Dan, Soham and Bastani, Osbert and Roth, Dan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4630--4643},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dan22a/dan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dan22a.html},\n abstract = \t {A key feature of human intelligence is the ability to generalize beyond the training distribution, for instance, parsing longer sentences than seen in the past. Currently, deep neural networks struggle to generalize robustly to such shifts in the data distribution. We study robust generalization in the context of using recurrent neural networks (RNNs) to learn regular languages. We hypothesize that standard end-to-end modeling strategies cannot generalize well to systematic distribution shifts and propose a compositional strategy to address this. We compare an end-to-end strategy that maps strings to labels with a compositional strategy that predicts the structure of the deterministic finite state automaton (DFA) that accepts the regular language. We theoretically prove that the compositional strategy generalizes significantly better than the end-to-end strategy. In our experiments, we implement the compositional strategy via an auxiliary task where the goal is to predict the intermediate states visited by the DFA when parsing a string. Our empirical results support our hypothesis, showing that auxiliary tasks can enable robust generalization. 
Interestingly, the end-to-end RNN generalizes significantly better than the theoretical lower bound, suggesting that it is able to achieve at least some degree of robust generalization.}\n}", "pdf": "https://proceedings.mlr.press/v162/dan22a/dan22a.pdf", "supp": "", "pdf_size": 1479372, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15727983735914873026&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer and Information Science, University of Pennsylvania; Department of Computer and Information Science, University of Pennsylvania; Department of Computer and Information Science, University of Pennsylvania", "aff_domain": "seas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "email": "seas.upenn.edu;seas.upenn.edu;seas.upenn.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dan22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "Department of Computer and Information Science", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding Robust Overfitting of Adversarial Training and Beyond", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16153", "id": "16153", "proceeding": "https://proceedings.mlr.press/v162/yu22b.html", "poster": "/media/PosterPDFs/ICML%202022/dd45045f8c68db9f54e70c67048d32e8.png?t=1657185618.6385305", "slides": "", "author_site": "Chaojian Yu, Bo Han, Li Shen, Jun Yu, Chen Gong, Mingming Gong, Tongliang Liu", "author": "Chaojian Yu; Bo Han; Li Shen; Jun Yu; Chen Gong; Mingming Gong; Tongliang Liu", "abstract": "Robust overfitting widely exists in adversarial training of deep networks. The exact underlying reasons for this are still not completely understood. Here, we explore the causes of robust overfitting by comparing the data distribution of non-overfit (weak adversary) and overfitted (strong adversary) adversarial training, and observe that the distribution of the adversarial data generated by weak adversary mainly contain small-loss data. However, the adversarial data generated by strong adversary is more diversely distributed on the large-loss data and the small-loss data. Given these observations, we further designed data ablation adversarial training and identify that some small-loss data which are not worthy of the adversary strength cause robust overfitting in the strong adversary mode. To relieve this issue, we propose minimum loss constrained adversarial training (MLCAT): in a minibatch, we learn large-loss data as usual, and adopt additional measures to increase the loss of the small-loss data. 
Technically, MLCAT hinders data fitting when they become easy to learn to prevent robust overfitting; philosophically, MLCAT reflects the spirit of turning waste into treasure and making the best use of each adversarial data; algorithmically, we designed two realizations of MLCAT, and extensive experiments demonstrate that MLCAT can eliminate robust overfitting and further boost adversarial robustness.", "bibtex": "@InProceedings{pmlr-v162-yu22b,\n title = \t {Understanding Robust Overfitting of Adversarial Training and Beyond},\n author = {Yu, Chaojian and Han, Bo and Shen, Li and Yu, Jun and Gong, Chen and Gong, Mingming and Liu, Tongliang},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25595--25610},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yu22b/yu22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/yu22b.html},\n abstract = \t {Robust overfitting widely exists in adversarial training of deep networks. The exact underlying reasons for this are still not completely understood. Here, we explore the causes of robust overfitting by comparing the data distribution of non-overfit (weak adversary) and overfitted (strong adversary) adversarial training, and observe that the distribution of the adversarial data generated by weak adversary mainly contain small-loss data. However, the adversarial data generated by strong adversary is more diversely distributed on the large-loss data and the small-loss data. Given these observations, we further designed data ablation adversarial training and identify that some small-loss data which are not worthy of the adversary strength cause robust overfitting in the strong adversary mode. To relieve this issue, we propose minimum loss constrained adversarial training (MLCAT): in a minibatch, we learn large-loss data as usual, and adopt additional measures to increase the loss of the small-loss data. 
Technically, MLCAT hinders data fitting when they become easy to learn to prevent robust overfitting; philosophically, MLCAT reflects the spirit of turning waste into treasure and making the best use of each adversarial data; algorithmically, we designed two realizations of MLCAT, and extensive experiments demonstrate that MLCAT can eliminate robust overfitting and further boost adversarial robustness.}\n}", "pdf": "https://proceedings.mlr.press/v162/yu22b/yu22b.pdf", "supp": "", "pdf_size": 2368681, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4696544864566467358&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "TML Lab, Sydney AI Centre, The University of Sydney, Sydney, Australia; Department of Computer Science, Hong Kong Baptist University, Hong Kong, China; JD Explore Academy, Beijing, China; Department of Automation, University of Science and Technology of China, Hefei, China; School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China; School of Mathematics and Statistics, The University of Melbourne, Melbourne, Australia; TML Lab, Sydney AI Centre, The University of Sydney, Sydney, Australia", "aff_domain": "sydney.edu.au; ; ; ; ; ;sydney.edu.au", "email": "sydney.edu.au; ; ; ; ; ;sydney.edu.au", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/yu22b.html", "aff_unique_index": "0;1;2;3;4;5;0", "aff_unique_norm": "University of Sydney;Hong Kong Baptist University;JD;University of Science and Technology of China;Nanjing University of Science and Technology;University of Melbourne", "aff_unique_dep": "TML Lab;Department of Computer Science;JD Explore Academy;Department of Automation;School of Computer Science and Engineering;School of Mathematics and Statistics", "aff_unique_url": "https://www.sydney.edu.au;https://www.hkbu.edu.hk;;http://www.ustc.edu.cn;http://www.nust.edu.cn;https://www.unimelb.edu.au", "aff_unique_abbr": "USYD;HKBU;;USTC;NUST;UniMelb", "aff_campus_unique_index": "0;1;2;3;4;5;0", "aff_campus_unique": "Sydney;Hong Kong;Beijing;Hefei;Nanjing;Melbourne", "aff_country_unique_index": "0;1;1;1;1;0;0", "aff_country_unique": "Australia;China" }, { "title": "Understanding The Robustness in Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18403", "id": "18403", "proceeding": "https://proceedings.mlr.press/v162/zhou22m.html", "poster": "/media/PosterPDFs/ICML%202022/9246444d94f081e3549803b928260f56_mWcDb75.png?t=1658122323.948078", "slides": "", "author_site": "Zhou Daquan, Zhiding Yu, Enze Xie, Chaowei Xiao, Animashree Anandkumar, Jiashi Feng, Jose M. Alvarez", "author": "Daquan Zhou; Zhiding Yu; Enze Xie; Chaowei Xiao; Animashree Anandkumar; Jiashi Feng; Jose M. Alvarez", "abstract": "Recent studies show that Vision Transformers (ViTs) exhibit strong robustness against various corruptions. Although this property is partly attributed to the self-attention mechanism, there is still a lack of an explanatory framework towards a more systematic understanding. In this paper, we examine the role of self-attention in learning robust representations. Our study is motivated by the intriguing properties of self-attention in visual grouping which indicate that self-attention could promote improved mid-level representation and robustness. We thus propose a family of fully attentional networks (FANs) that incorporate self-attention in both token mixing and channel processing. 
We validate the design comprehensively on various hierarchical backbones. Our model with a DeiT architecture achieves a state-of-the-art 47.6% mCE on ImageNet-C with 29M parameters. We also demonstrate significantly improved robustness in two downstream tasks: semantic segmentation and object detection", "bibtex": "@InProceedings{pmlr-v162-zhou22m,\n title = \t {Understanding The Robustness in Vision Transformers},\n author = {Zhou, Daquan and Yu, Zhiding and Xie, Enze and Xiao, Chaowei and Anandkumar, Animashree and Feng, Jiashi and Alvarez, Jose M.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27378--27394},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22m/zhou22m.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22m.html},\n abstract = \t {Recent studies show that Vision Transformers (ViTs) exhibit strong robustness against various corruptions. Although this property is partly attributed to the self-attention mechanism, there is still a lack of an explanatory framework towards a more systematic understanding. In this paper, we examine the role of self-attention in learning robust representations. Our study is motivated by the intriguing properties of self-attention in visual grouping which indicate that self-attention could promote improved mid-level representation and robustness. We thus propose a family of fully attentional networks (FANs) that incorporate self-attention in both token mixing and channel processing. We validate the design comprehensively on various hierarchical backbones. Our model with a DeiT architecture achieves a state-of-the-art 47.6% mCE on ImageNet-C with 29M parameters. 
We also demonstrate significantly improved robustness in two downstream tasks: semantic segmentation and object detection}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22m/zhou22m.pdf", "supp": "", "pdf_size": 5139909, "gs_citation": 235, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3041067607452518927&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "National University of Singapore; NVIDIA; The University of Hong Kong; NVIDIA+ASU; NVIDIA+Caltech; ByteDance+NUS; NVIDIA", "aff_domain": "nvidia.com; ; ; ; ; ; ", "email": "nvidia.com; ; ; ; ; ; ", "github": "https://github.com/NVlabs/FAN", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/zhou22m.html", "aff_unique_index": "0;1;2;1+3;1+4;5+0;1", "aff_unique_norm": "National University of Singapore;NVIDIA;University of Hong Kong;Arizona State University;California Institute of Technology;ByteDance", "aff_unique_dep": ";NVIDIA Corporation;;;;", "aff_unique_url": "https://www.nus.edu.sg;https://www.nvidia.com;https://www.hku.hk;https://www.asu.edu;https://www.caltech.edu;https://www.bytedance.com", "aff_unique_abbr": "NUS;NVIDIA;HKU;ASU;Caltech;ByteDance", "aff_campus_unique_index": "1;;2;", "aff_campus_unique": ";Hong Kong SAR;Pasadena", "aff_country_unique_index": "0;1;2;1+1;1+1;2+0;1", "aff_country_unique": "Singapore;United States;China" }, { "title": "Understanding and Improving Knowledge Graph Embedding for Entity Alignment", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17603", "id": "17603", "proceeding": "https://proceedings.mlr.press/v162/guo22i.html", "poster": "/media/PosterPDFs/ICML%202022/ac34ae1fda29b8fe781ac8d6d32a6bc7.png?t=1657768608.5616114", "slides": "", "author_site": "Lingbing Guo, Qiang Zhang, Zequn Sun, Mingyang Chen, Wei Hu, Huajun Chen", "author": "Lingbing Guo; Qiang Zhang; Zequn Sun; Mingyang Chen; Wei Hu; Huajun Chen", "abstract": "Embedding-based entity alignment (EEA) has recently received great attention. Despite significant performance improvement, few efforts have been paid to facilitate understanding of EEA methods. Most existing studies rest on the assumption that a small number of pre-aligned entities can serve as anchors connecting the embedding spaces of two KGs. Nevertheless, no one has investigated the rationality of such an assumption. To fill the research gap, we define a typical paradigm abstracted from existing EEA methods and analyze how the embedding discrepancy between two potentially aligned entities is implicitly bounded by a predefined margin in the score function. Further, we find that such a bound cannot guarantee to be tight enough for alignment learning. We mitigate this problem by proposing a new approach, named NeoEA, to explicitly learn KG-invariant and principled entity embeddings. In this sense, an EEA model not only pursues the closeness of aligned entities based on geometric distance, but also aligns the neural ontologies of two KGs by eliminating the discrepancy in embedding distribution and underlying ontology knowledge. 
Our experiments demonstrate consistent and significant performance improvement against the best-performing EEA methods.", "bibtex": "@InProceedings{pmlr-v162-guo22i,\n title = \t {Understanding and Improving Knowledge Graph Embedding for Entity Alignment},\n author = {Guo, Lingbing and Zhang, Qiang and Sun, Zequn and Chen, Mingyang and Hu, Wei and Chen, Huajun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8145--8156},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/guo22i/guo22i.pdf},\n url = \t {https://proceedings.mlr.press/v162/guo22i.html},\n abstract = \t {Embedding-based entity alignment (EEA) has recently received great attention. Despite significant performance improvement, few efforts have been paid to facilitate understanding of EEA methods. Most existing studies rest on the assumption that a small number of pre-aligned entities can serve as anchors connecting the embedding spaces of two KGs. Nevertheless, no one has investigated the rationality of such an assumption. To fill the research gap, we define a typical paradigm abstracted from existing EEA methods and analyze how the embedding discrepancy between two potentially aligned entities is implicitly bounded by a predefined margin in the score function. Further, we find that such a bound cannot guarantee to be tight enough for alignment learning. We mitigate this problem by proposing a new approach, named NeoEA, to explicitly learn KG-invariant and principled entity embeddings. In this sense, an EEA model not only pursues the closeness of aligned entities based on geometric distance, but also aligns the neural ontologies of two KGs by eliminating the discrepancy in embedding distribution and underlying ontology knowledge. 
Our experiments demonstrate consistent and significant performance improvement against the best-performing EEA methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/guo22i/guo22i.pdf", "supp": "", "pdf_size": 1484876, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6512742248289986110&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "1College of Computer Science and Technology, Zhejiang University + 2ZJU-Hangzhou Global Scientific and Technological Innovation Center + 3Alibaba-Zhejiang University Joint Reseach Institute of Frontier Technologies; 1College of Computer Science and Technology, Zhejiang University + 2ZJU-Hangzhou Global Scientific and Technological Innovation Center + 3Alibaba-Zhejiang University Joint Reseach Institute of Frontier Technologies; 4State Key Laboratory for Novel Software Technology, Nanjing University, China; 1College of Computer Science and Technology, Zhejiang University + 3Alibaba-Zhejiang University Joint Reseach Institute of Frontier Technologies; 4State Key Laboratory for Novel Software Technology, Nanjing University, China; 1College of Computer Science and Technology, Zhejiang University + 2ZJU-Hangzhou Global Scientific and Technological Innovation Center + 3Alibaba-Zhejiang University Joint Reseach Institute of Frontier Technologies", "aff_domain": "zju.edu.cn;zju.edu.cn; ; ; ; ", "email": "zju.edu.cn;zju.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/guo22i.html", "aff_unique_index": "0+1+0;0+1+0;2;0+0;2;0+1+0", "aff_unique_norm": "Zhejiang University;Hangzhou Global Scientific and Technological Innovation Center;Nanjing University", "aff_unique_dep": "College of Computer Science and Technology;;State Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.zju.edu.cn;;http://www.nju.edu.cn", "aff_unique_abbr": "ZJU;;", "aff_campus_unique_index": "1;1;;1", "aff_campus_unique": ";Hangzhou", "aff_country_unique_index": "0+0+0;0+0+0;0;0+0;0;0+0+0", "aff_country_unique": "China" }, { "title": "Understanding the unstable convergence of gradient descent", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16685", "id": "16685", "proceeding": "https://proceedings.mlr.press/v162/ahn22a.html", "poster": "/media/PosterPDFs/ICML%202022/908c9a564a86426585b29f5335b619bc.png?t=1657204931.5198736", "slides": "", "author_site": "Kwangjun Ahn, Jingzhao Zhang, Suvrit Sra", "author": "Kwangjun Ahn; Jingzhao Zhang; Suvrit Sra", "abstract": "Most existing analyses of (stochastic) gradient descent rely on the condition that for $L$-smooth costs, the step size is less than $2/L$. However, many works have observed that in machine learning applications step sizes often do not fulfill this condition, yet (stochastic) gradient descent still converges, albeit in an unstable manner. We investigate this unstable convergence phenomenon from first principles, and discuss key causes behind it. 
We also identify its main characteristics, and how they interrelate based on both theory and experiments, offering a principled view toward understanding the phenomenon.", "bibtex": "@InProceedings{pmlr-v162-ahn22a,\n title = \t {Understanding the unstable convergence of gradient descent},\n author = {Ahn, Kwangjun and Zhang, Jingzhao and Sra, Suvrit},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {247--257},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ahn22a/ahn22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ahn22a.html},\n abstract = \t {Most existing analyses of (stochastic) gradient descent rely on the condition that for $L$-smooth costs, the step size is less than $2/L$. However, many works have observed that in machine learning applications step sizes often do not fulfill this condition, yet (stochastic) gradient descent still converges, albeit in an unstable manner. We investigate this unstable convergence phenomenon from first principles, and discuss key causes behind it. We also identify its main characteristics, and how they interrelate based on both theory and experiments, offering a principled view toward understanding the phenomenon.}\n}", "pdf": "https://proceedings.mlr.press/v162/ahn22a/ahn22a.pdf", "supp": "", "pdf_size": 1503099, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6310383193568445084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of EECS, MIT, Cambridge, MA, USA + Simons Institute for the Theory of Computing, Berkeley, CA, USA; IIIS, Tsinghua University, Beijing, China; Department of EECS, MIT, Cambridge, MA, USA", "aff_domain": "mit.edu; ; ", "email": "mit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/ahn22a.html", "aff_unique_index": "0+1;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Simons Institute for the Theory of Computing;Tsinghua University", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;;IIIS", "aff_unique_url": "https://web.mit.edu;https://simons.berkeley.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "MIT;SITC;THU", "aff_campus_unique_index": "0+1;2;0", "aff_campus_unique": "Cambridge;Berkeley;Beijing", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "United States;China" }, { "title": "UniRank: Unimodal Bandit Algorithms for Online Ranking", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17713", "id": "17713", "proceeding": "https://proceedings.mlr.press/v162/gauthier22a.html", "poster": "/media/PosterPDFs/ICML%202022/a71378c0f8d76dbf90feeecd095d0ed9.png?t=1658231114.3333457", "slides": "", "author_site": "Camille-Sovanneary GAUTHIER, Romaric Gaudel, Elisa Fromont", "author": "Camille-Sovanneary Gauthier; Romaric Gaudel; Elisa Fromont", "abstract": "We tackle, in the multiple-play bandit setting, the online ranking problem of assigning L items to K predefined positions on a web page in order to maximize the number of user clicks. We propose a generic algorithm, UniRank, that tackles state-of-the-art click models. 
The regret bound of this algorithm is a direct consequence of the pseudo-unimodality property of the bandit setting with respect to a graph where nodes are ordered sets of indistinguishable items. The main contribution of UniRank is its O(L/$\\Delta$ logT) regret for T consecutive assignments, where $\\Delta$ relates to the reward-gap between two items. This regret bound is based on the usually implicit condition that two items may not have the same attractiveness. Experiments against state-of-the-art learning algorithms specialized or not for different click models, show that our method has better regret performance than other generic algorithms on real life and synthetic datasets.", "bibtex": "@InProceedings{pmlr-v162-gauthier22a,\n title = \t {{U}ni{R}ank: Unimodal Bandit Algorithms for Online Ranking},\n author = {Gauthier, Camille-Sovanneary and Gaudel, Romaric and Fromont, Elisa},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7279--7309},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gauthier22a/gauthier22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gauthier22a.html},\n abstract = \t {We tackle, in the multiple-play bandit setting, the online ranking problem of assigning L items to K predefined positions on a web page in order to maximize the number of user clicks. We propose a generic algorithm, UniRank, that tackles state-of-the-art click models. The regret bound of this algorithm is a direct consequence of the pseudo-unimodality property of the bandit setting with respect to a graph where nodes are ordered sets of indistinguishable items. The main contribution of UniRank is its O(L/$\\Delta$ logT) regret for T consecutive assignments, where $\\Delta$ relates to the reward-gap between two items. This regret bound is based on the usually implicit condition that two items may not have the same attractiveness. Experiments against state-of-the-art learning algorithms specialized or not for different click models, show that our method has better regret performance than other generic algorithms on real life and synthetic datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/gauthier22a/gauthier22a.pdf", "supp": "", "pdf_size": 3828843, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6163643713147920473&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Louis Vuitton, F-75001 Paris, France + IRISA UMR 6074 / INRIA rba, F-35000 Rennes, France; Univ Rennes, Ensai, CNRS, CREST - UMR 9194, F-35000 Rennes, France; Univ. 
Rennes 1, F-35000 Rennes, France + Institut Universitaire de France, M.E.S.R.I., F-75231 Paris", "aff_domain": "louisvuitton.com; ; ", "email": "louisvuitton.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gauthier22a.html", "aff_unique_index": "0+1;2;3+4", "aff_unique_norm": "Louis Vuitton;INRIA;Universite Rennes;University Rennes 1;Institut Universitaire de France", "aff_unique_dep": ";UMR 6074;Ensai, CNRS, CREST - UMR 9194;;", "aff_unique_url": "https://www.louisvuitton.com;https://www.inria.fr;https://www.univ-rennes1.fr;https://www.univ-rennes1.fr;https://www.iuf.cnrs.fr", "aff_unique_abbr": ";INRIA;Univ Rennes;UR1;IUF", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Rennes", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "France" }, { "title": "Unified Fourier-based Kernel and Nonlinearity Design for Equivariant Networks on Homogeneous Spaces", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17197", "id": "17197", "proceeding": "https://proceedings.mlr.press/v162/xu22e.html", "poster": "/media/PosterPDFs/ICML%202022/7ef2f13f0e9d3478d7c36f6483d38a86.png?t=1657652413.42663", "slides": "", "author_site": "Yinshuang Xu, Jiahui Lei, Edgar Dobriban, Kostas Daniilidis", "author": "Yinshuang Xu; Jiahui Lei; Edgar Dobriban; Kostas Daniilidis", "abstract": "We introduce a unified framework for group equivariant networks on homogeneous spaces derived from a Fourier perspective. We consider tensor-valued feature fields, before and after a convolutional layer. We present a unified derivation of kernels via the Fourier domain by leveraging the sparsity of Fourier coefficients of the lifted feature fields. The sparsity emerges when the stabilizer subgroup of the homogeneous space is a compact Lie group. We further introduce a nonlinear activation, via an elementwise nonlinearity on the regular representation after lifting and projecting back to the field through an equivariant convolution. We show that other methods treating features as the Fourier coefficients in the stabilizer subgroup are special cases of our activation. Experiments on $SO(3)$ and $SE(3)$ show state-of-the-art performance in spherical vector field regression, point cloud classification, and molecular completion.", "bibtex": "@InProceedings{pmlr-v162-xu22e,\n title = \t {Unified {F}ourier-based Kernel and Nonlinearity Design for Equivariant Networks on Homogeneous Spaces},\n author = {Xu, Yinshuang and Lei, Jiahui and Dobriban, Edgar and Daniilidis, Kostas},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24596--24614},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/xu22e/xu22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/xu22e.html},\n abstract = \t {We introduce a unified framework for group equivariant networks on homogeneous spaces derived from a Fourier perspective. We consider tensor-valued feature fields, before and after a convolutional layer. We present a unified derivation of kernels via the Fourier domain by leveraging the sparsity of Fourier coefficients of the lifted feature fields. 
The sparsity emerges when the stabilizer subgroup of the homogeneous space is a compact Lie group. We further introduce a nonlinear activation, via an elementwise nonlinearity on the regular representation after lifting and projecting back to the field through an equivariant convolution. We show that other methods treating features as the Fourier coefficients in the stabilizer subgroup are special cases of our activation. Experiments on $SO(3)$ and $SE(3)$ show state-of-the-art performance in spherical vector field regression, point cloud classification, and molecular completion.}\n}", "pdf": "https://proceedings.mlr.press/v162/xu22e/xu22e.pdf", "supp": "", "pdf_size": 2052526, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5834314388136668618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer and Information Science, University of Pennsylvania; Department of Computer and Information Science, University of Pennsylvania; Department of Statistics and Data Science, University of Pennsylvania + Department of Computer and Information Science, University of Pennsylvania; Department of Computer and Information Science, University of Pennsylvania", "aff_domain": "seas.upenn.edu;seas.upenn.edu;wharton.upenn.edu;cis.upenn.edu", "email": "seas.upenn.edu;seas.upenn.edu;wharton.upenn.edu;cis.upenn.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/xu22e.html", "aff_unique_index": "0;0;0+0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "Department of Computer and Information Science", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Unified Scaling Laws for Routed Language Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17819", "id": "17819", "proceeding": "https://proceedings.mlr.press/v162/clark22a.html", "poster": "/media/PosterPDFs/ICML%202022/ddac1f6f13bb372a177804adcd3b8a31.png?t=1658068930.2733161", "slides": "", "author_site": "Aidan Clark, Diego de Las Casas, Aurelia Guy, Arthur Mensch, Michela Paganini, Jordan Hoffmann, Bogdan Damoc, Blake Hechtman, Trevor Cai, Sebastian Borgeaud, George van den Driessche, Eliza Rutherford, Tom Hennigan, Matthew Johnson, Albin Cassirer, Chris Jones, Elena Buchatskaya, David Budden, Laurent Sifre, Simon Osindero, Oriol Vinyals, Marc'Aurelio Ranzato, Jack Rae, Erich Elsen, Koray Kavukcuoglu, Karen Simonyan", "author": "Aidan Clark; Diego De Las Casas; Aurelia Guy; Arthur Mensch; Michela Paganini; Jordan Hoffmann; Bogdan Damoc; Blake Hechtman; Trevor Cai; Sebastian Borgeaud; George Bm Van Den Driessche; Eliza Rutherford; Tom Hennigan; Matthew J Johnson; Albin Cassirer; Chris Jones; Elena Buchatskaya; David Budden; Laurent Sifre; Simon Osindero; Oriol Vinyals; Marc\u2019Aurelio Ranzato; Jack Rae; Erich Elsen; Koray Kavukcuoglu; Karen Simonyan", "abstract": "The performance of a language model has been shown to be effectively modeled as a power-law in its parameter count. Here we study the scaling behaviors of Routing Networks: architectures that conditionally use only a subset of their parameters while processing an input. For these models, parameter count and computational requirement form two independent axes along which an increase leads to better performance. 
In this work we derive and justify scaling laws defined on these two variables which generalize those known for standard language models and describe the performance of a wide range of routing architectures trained via three different techniques. Afterwards we provide two applications of these laws: first deriving an Effective Parameter Count along which all models scale at the same rate, and then using the scaling coefficients to give a quantitative comparison of the three routing techniques considered. Our analysis derives from an extensive evaluation of Routing Networks across five orders of magnitude of size, including models with hundreds of experts and hundreds of billions of parameters.", "bibtex": "@InProceedings{pmlr-v162-clark22a,\n title = \t {Unified Scaling Laws for Routed Language Models},\n author = {Clark, Aidan and De Las Casas, Diego and Guy, Aurelia and Mensch, Arthur and Paganini, Michela and Hoffmann, Jordan and Damoc, Bogdan and Hechtman, Blake and Cai, Trevor and Borgeaud, Sebastian and Van Den Driessche, George Bm and Rutherford, Eliza and Hennigan, Tom and Johnson, Matthew J and Cassirer, Albin and Jones, Chris and Buchatskaya, Elena and Budden, David and Sifre, Laurent and Osindero, Simon and Vinyals, Oriol and Ranzato, Marc'Aurelio and Rae, Jack and Elsen, Erich and Kavukcuoglu, Koray and Simonyan, Karen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4057--4086},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/clark22a/clark22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/clark22a.html},\n abstract = \t {The performance of a language model has been shown to be effectively modeled as a power-law in its parameter count. Here we study the scaling behaviors of Routing Networks: architectures that conditionally use only a subset of their parameters while processing an input. For these models, parameter count and computational requirement form two independent axes along which an increase leads to better performance. In this work we derive and justify scaling laws defined on these two variables which generalize those known for standard language models and describe the performance of a wide range of routing architectures trained via three different techniques. Afterwards we provide two applications of these laws: first deriving an Effective Parameter Count along which all models scale at the same rate, and then using the scaling coefficients to give a quantitative comparison of the three routing techniques considered. 
Our analysis derives from an extensive evaluation of Routing Networks across five orders of magnitude of size, including models with hundreds of experts and hundreds of billions of parameters.}\n}", "pdf": "https://proceedings.mlr.press/v162/clark22a/clark22a.pdf", "supp": "", "pdf_size": 1537938, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9764357455664170448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;;;;;;;;;;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;;;;;;;;;;;", "email": ";;;;;;;;;;;;;;;;;;;;;;;;;", "github": "", "project": "", "author_num": 26, "oa": "https://proceedings.mlr.press/v162/clark22a.html" }, { "title": "Universal Hopfield Networks: A General Framework for Single-Shot Associative Memory Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17307", "id": "17307", "proceeding": "https://proceedings.mlr.press/v162/millidge22a.html", "poster": "/media/PosterPDFs/ICML%202022/8a88d5f412f2ad376f8597d28cbd3720.png?t=1657724148.0570376", "slides": "", "author_site": "Beren Millidge, Tommaso Salvatori, Yuhang Song, Thomas Lukasiewicz, Rafal Bogacz", "author": "Beren Millidge; Tommaso Salvatori; Yuhang Song; Thomas Lukasiewicz; Rafal Bogacz", "abstract": "A large number of neural network models of associative memory have been proposed in the literature. These include the classical Hopfield networks (HNs), sparse distributed memories (SDMs), and more recently the modern continuous Hopfield networks (MCHNs), which possess close links with self-attention in machine learning. In this paper, we propose a general framework for understanding the operation of such memory networks as a sequence of three operations: similarity, separation, and projection. We derive all these memory models as instances of our general framework with differing similarity and separation functions. We extend the mathematical framework of Krotov et al (2020) to express general associative memory models using neural network dynamics with local computation, and derive a general energy function that is a Lyapunov function of the dynamics. Finally, using our framework, we empirically investigate the capacity of using different similarity functions for these associative memory models, beyond the dot product similarity measure, and demonstrate empirically that Euclidean or Manhattan distance similarity metrics perform substantially better in practice on many tasks, enabling a more robust retrieval and higher memory capacity than existing\u00a0models.", "bibtex": "@InProceedings{pmlr-v162-millidge22a,\n title = \t {Universal Hopfield Networks: A General Framework for Single-Shot Associative Memory Models},\n author = {Millidge, Beren and Salvatori, Tommaso and Song, Yuhang and Lukasiewicz, Thomas and Bogacz, Rafal},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15561--15583},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/millidge22a/millidge22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/millidge22a.html},\n abstract = \t {A large number of neural network models of associative memory have been proposed in the literature. 
These include the classical Hopfield networks (HNs), sparse distributed memories (SDMs), and more recently the modern continuous Hopfield networks (MCHNs), which possess close links with self-attention in machine learning. In this paper, we propose a general framework for understanding the operation of such memory networks as a sequence of three operations: similarity, separation, and projection. We derive all these memory models as instances of our general framework with differing similarity and separation functions. We extend the mathematical framework of Krotov et al (2020) to express general associative memory models using neural network dynamics with local computation, and derive a general energy function that is a Lyapunov function of the dynamics. Finally, using our framework, we empirically investigate the capacity of using different similarity functions for these associative memory models, beyond the dot product similarity measure, and demonstrate empirically that Euclidean or Manhattan distance similarity metrics perform substantially better in practice on many tasks, enabling a more robust retrieval and higher memory capacity than existing\u00a0models.}\n}", "pdf": "https://proceedings.mlr.press/v162/millidge22a/millidge22a.pdf", "supp": "", "pdf_size": 3076997, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11661827262437868518&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "MRC Brain Network Dynamics Unit, University of Oxford, UK; Department of Computer Science, University of Oxford, UK; MRC Brain Network Dynamics Unit, University of Oxford, UK + Department of Computer Science, University of Oxford, UK; Institute of Logic and Computation, TU Wien, Austria + Department of Computer Science, University of Oxford, UK; MRC Brain Network Dynamics Unit, University of Oxford, UK", "aff_domain": "some.ox.ac.uk;some.ox.ac.uk;some.ox.ac.uk;some.tuwien.ac.at;some.ox.ac.uk", "email": "some.ox.ac.uk;some.ox.ac.uk;some.ox.ac.uk;some.tuwien.ac.at;some.ox.ac.uk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/millidge22a.html", "aff_unique_index": "0;0;0+0;1+0;0", "aff_unique_norm": "University of Oxford;TU Wien", "aff_unique_dep": "MRC Brain Network Dynamics Unit;Institute of Logic and Computation", "aff_unique_url": "https://www.ox.ac.uk;https://www.tuwien.ac.at", "aff_unique_abbr": "Oxford;TU Wien", "aff_campus_unique_index": "0;0;;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;0;0+0;1+0;0", "aff_country_unique": "United Kingdom;Austria" }, { "title": "Universal Joint Approximation of Manifolds and Densities by Simple Injective Flows", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16427", "id": "16427", "proceeding": "https://proceedings.mlr.press/v162/puthawala22a.html", "poster": "/media/PosterPDFs/ICML%202022/87475f2c1909e4e6d0d7f0e020a2ded3.png?t=1657728184.14422", "slides": "/media/icml-2022/Slides/16427.pdf", "author_site": "Michael Puthawala, Matti Lassas, Ivan Dokmanic, Maarten de Hoop", "author": "Michael Puthawala; Matti Lassas; Ivan Dokmanic; Maarten De Hoop", "abstract": "We study approximation of probability measures supported on n-dimensional manifolds embedded in R^m by injective flows\u2014neural networks composed of invertible flows and injective layers. 
We show that in general, injective flows between R^n and R^m universally approximate measures supported on images of extendable embeddings, which are a subset of standard embeddings: when the embedding dimension m is small, topological obstructions may preclude certain manifolds as admissible targets. When the embedding dimension is sufficiently large, m >= 3n+1, we use an argument from algebraic topology known as the clean trick to prove that the topological obstructions vanish and injective flows universally approximate any differentiable embedding. Along the way we show that the studied injective flows admit efficient projections on the range, and that their optimality can be established \"in reverse,\" resolving a conjecture made in Brehmer & Cranmer 2020.", "bibtex": "@InProceedings{pmlr-v162-puthawala22a,\n title = \t {Universal Joint Approximation of Manifolds and Densities by Simple Injective Flows},\n author = {Puthawala, Michael and Lassas, Matti and Dokmanic, Ivan and De Hoop, Maarten},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17959--17983},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/puthawala22a/puthawala22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/puthawala22a.html},\n abstract = \t {We study approximation of probability measures supported on n-dimensional manifolds embedded in R^m by injective flows\u2014neural networks composed of invertible flows and injective layers. We show that in general, injective flows between R^n and R^m universally approximate measures supported on images of extendable embeddings, which are a subset of standard embeddings: when the embedding dimension m is small, topological obstructions may preclude certain manifolds as admissible targets. When the embedding dimension is sufficiently large, m >= 3n+1, we use an argument from algebraic topology known as the clean trick to prove that the topological obstructions vanish and injective flows universally approximate any differentiable embedding. Along the way we show that the studied injective flows admit efficient projections on the range, and that their optimality can be established \"in reverse,\" resolving a conjecture made in Brehmer & Cranmer 2020.}\n}", "pdf": "https://proceedings.mlr.press/v162/puthawala22a/puthawala22a.pdf", "supp": "", "pdf_size": 633088, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9340911345063794275&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/puthawala22a.html" }, { "title": "Universal and data-adaptive algorithms for model selection in linear contextual bandits", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16805", "id": "16805", "proceeding": "https://proceedings.mlr.press/v162/muthukumar22a.html", "poster": "", "slides": "", "author_site": "Vidya Muthukumar, Akshay Krishnamurthy", "author": "Vidya K Muthukumar; Akshay Krishnamurthy", "abstract": "Model selection in contextual bandits is an important complementary problem to regret minimization with respect to a fixed model class. 
We consider the simplest non-trivial instance of model-selection: distinguishing a simple multi-armed bandit problem from a linear contextual bandit problem. Even in this instance, current state-of-the-art methods explore in a suboptimal manner and require strong \"feature-diversity\" conditions. In this paper, we introduce new algorithms that a) explore in a data-adaptive manner, and b) provide model selection guarantees of the form O(d^{\\alpha} T^{1 - \\alpha}) with no feature diversity conditions whatsoever, where d denotes the dimension of the linear model and T denotes the total number of rounds. The first algorithm enjoys a \"best-of-both-worlds\" property, recovering two prior results that hold under distinct distributional assumptions, simultaneously. The second removes distributional assumptions altogether, expanding the scope for tractable model selection. Our approach extends to model selection among nested linear contextual bandits under some additional assumptions.", "bibtex": "@InProceedings{pmlr-v162-muthukumar22a,\n title = \t {Universal and data-adaptive algorithms for model selection in linear contextual bandits},\n author = {Muthukumar, Vidya K and Krishnamurthy, Akshay},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16197--16222},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/muthukumar22a/muthukumar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/muthukumar22a.html},\n abstract = \t {Model selection in contextual bandits is an important complementary problem to regret minimization with respect to a fixed model class. We consider the simplest non-trivial instance of model-selection: distinguishing a simple multi-armed bandit problem from a linear contextual bandit problem. Even in this instance, current state-of-the-art methods explore in a suboptimal manner and require strong \"feature-diversity\" conditions. In this paper, we introduce new algorithms that a) explore in a data-adaptive manner, and b) provide model selection guarantees of the form O(d^{\\alpha} T^{1 - \\alpha}) with no feature diversity conditions whatsoever, where d denotes the dimension of the linear model and T denotes the total number of rounds. The first algorithm enjoys a \"best-of-both-worlds\" property, recovering two prior results that hold under distinct distributional assumptions, simultaneously. The second removes distributional assumptions altogether, expanding the scope for tractable model selection. Our approach extends to model selection among nested linear contextual bandits under some additional assumptions.}\n}", "pdf": "https://proceedings.mlr.press/v162/muthukumar22a/muthukumar22a.pdf", "supp": "", "pdf_size": 477787, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3607687191007609479&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, USA+H. 
Milton School of Industrial and Systems Engineering, Georgia Institute of Technology, Atlanta, USA; Microsoft Research, New York City, USA", "aff_domain": "gatech.edu; ", "email": "gatech.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/muthukumar22a.html", "aff_unique_index": "0+0;1", "aff_unique_norm": "Georgia Institute of Technology;Microsoft", "aff_unique_dep": "School of Electrical and Computer Engineering;Microsoft Research", "aff_unique_url": "https://www.gatech.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Georgia Tech;MSR", "aff_campus_unique_index": "0+0;1", "aff_campus_unique": "Atlanta;New York City", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Universality of Winning Tickets: A Renormalization Group Perspective", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16283", "id": "16283", "proceeding": "https://proceedings.mlr.press/v162/redman22a.html", "poster": "/media/PosterPDFs/ICML%202022/1714726c817af50457d810aae9d27a2e.png?t=1657234004.9866846", "slides": "", "author_site": "William T. Redman, Tianlong Chen, Zhangyang \u201cAtlas\u201d Wang, Akshunna S. Dogra", "author": "William T Redman; Tianlong Chen; Zhangyang Wang; Akshunna S. Dogra", "abstract": "Foundational work on the Lottery Ticket Hypothesis has suggested an exciting corollary: winning tickets found in the context of one task can be transferred to similar tasks, possibly even across different architectures. This has generated broad interest, but methods to study this universality are lacking. We make use of renormalization group theory, a powerful tool from theoretical physics, to address this need. We find that iterative magnitude pruning, the principal algorithm used for discovering winning tickets, is a renormalization group scheme, and can be viewed as inducing a flow in parameter space. We demonstrate that ResNet-50 models with transferable winning tickets have flows with common properties, as would be expected from the theory. Similar observations are made for BERT models, with evidence that their flows are near fixed points. Additionally, we leverage our framework to study winning tickets transferred across ResNet architectures, observing that smaller models have flows with more uniform properties than larger models, complicating transfer between them.", "bibtex": "@InProceedings{pmlr-v162-redman22a,\n title = \t {Universality of Winning Tickets: A Renormalization Group Perspective},\n author = {Redman, William T and Chen, Tianlong and Wang, Zhangyang and Dogra, Akshunna S.},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18483--18498},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/redman22a/redman22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/redman22a.html},\n abstract = \t {Foundational work on the Lottery Ticket Hypothesis has suggested an exciting corollary: winning tickets found in the context of one task can be transferred to similar tasks, possibly even across different architectures. This has generated broad interest, but methods to study this universality are lacking. 
We make use of renormalization group theory, a powerful tool from theoretical physics, to address this need. We find that iterative magnitude pruning, the principal algorithm used for discovering winning tickets, is a renormalization group scheme, and can be viewed as inducing a flow in parameter space. We demonstrate that ResNet-50 models with transferable winning tickets have flows with common properties, as would be expected from the theory. Similar observations are made for BERT models, with evidence that their flows are near fixed points. Additionally, we leverage our framework to study winning tickets transferred across ResNet architectures, observing that smaller models have flows with more uniform properties than larger models, complicating transfer between them.}\n}", "pdf": "https://proceedings.mlr.press/v162/redman22a/redman22a.pdf", "supp": "", "pdf_size": 1317988, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17857065200684059331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Interdepartmental Graduate Program in Dynamical Neuroscience, University of California, Santa Barbara; Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Electrical and Computer Engineering, University of Texas at Austin; Department of Mathematics, Imperial College London + EPSRC CDT in Mathematics of Random Systems: Analysis, Modelling and Simulation", "aff_domain": "ucsb.edu; ; ; ", "email": "ucsb.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/redman22a.html", "aff_unique_index": "0;1;1;2+3", "aff_unique_norm": "University of California, Santa Barbara;University of Texas at Austin;Imperial College London;EPSRC", "aff_unique_dep": "Interdepartmental Graduate Program in Dynamical Neuroscience;Department of Electrical and Computer Engineering;Department of Mathematics;Mathematics of Random Systems: Analysis, Modelling and Simulation", "aff_unique_url": "https://www.ucsb.edu;https://www.utexas.edu;https://www.imperial.ac.uk;https://www.epsrc.ukri.org/", "aff_unique_abbr": "UCSB;UT Austin;Imperial;EPSRC", "aff_campus_unique_index": "0;1;1;2", "aff_campus_unique": "Santa Barbara;Austin;London;", "aff_country_unique_index": "0;0;0;1+1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Unraveling Attention via Convex Duality: Analysis and Interpretations of Vision Transformers", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17171", "id": "17171", "proceeding": "https://proceedings.mlr.press/v162/sahiner22a.html", "poster": "/media/PosterPDFs/ICML%202022/c7217b04fe11f374f9a6737901025606_oG73OLQ.png?t=1657735570.6996105", "slides": "", "author_site": "Arda Sahiner, Tolga Ergen, Batu M Ozturkler, John Pauly, Morteza Mardani, Mert Pilanci", "author": "Arda Sahiner; Tolga Ergen; Batu Ozturkler; John Pauly; Morteza Mardani; Mert Pilanci", "abstract": "Vision transformers using self-attention or its proposed alternatives have demonstrated promising results in many image related tasks. However, the underpinning inductive bias of attention is not well understood. To address this issue, this paper analyzes attention through the lens of convex duality. For the non-linear dot-product self-attention, and alternative mechanisms such as MLP-mixer and Fourier Neural Operator (FNO), we derive equivalent finite-dimensional convex problems that are interpretable and solvable to global optimality. 
The convex programs lead to block nuclear-norm regularization that promotes low rank in the latent feature and token dimensions. In particular, we show how self-attention networks implicitly cluster the tokens based on their latent similarity. We conduct experiments for transferring a pre-trained transformer backbone for CIFAR-100 classification by fine-tuning a variety of convex attention heads. The results indicate the merits of the bias induced by attention compared with the existing MLP or linear heads.", "bibtex": "@InProceedings{pmlr-v162-sahiner22a,\n title = \t {Unraveling Attention via Convex Duality: Analysis and Interpretations of Vision Transformers},\n author = {Sahiner, Arda and Ergen, Tolga and Ozturkler, Batu and Pauly, John and Mardani, Morteza and Pilanci, Mert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19050--19088},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/sahiner22a/sahiner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/sahiner22a.html},\n abstract = \t {Vision transformers using self-attention or its proposed alternatives have demonstrated promising results in many image related tasks. However, the underpinning inductive bias of attention is not well understood. To address this issue, this paper analyzes attention through the lens of convex duality. For the non-linear dot-product self-attention, and alternative mechanisms such as MLP-mixer and Fourier Neural Operator (FNO), we derive equivalent finite-dimensional convex problems that are interpretable and solvable to global optimality. The convex programs lead to block nuclear-norm regularization that promotes low rank in the latent feature and token dimensions. In particular, we show how self-attention networks implicitly cluster the tokens based on their latent similarity. We conduct experiments for transferring a pre-trained transformer backbone for CIFAR-100 classification by fine-tuning a variety of convex attention heads. 
The results indicate the merits of the bias induced by attention compared with the existing MLP or linear heads.}\n}", "pdf": "https://proceedings.mlr.press/v162/sahiner22a/sahiner22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/sahiner22a-supp.zip", "pdf_size": 899225, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10895922445347839520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; Department of Electrical Engineering, Stanford University; NVIDIA Corporation; Department of Electrical Engineering, Stanford University", "aff_domain": "stanford.edu; ; ; ; ; ", "email": "stanford.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/sahiner22a.html", "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Stanford University;NVIDIA", "aff_unique_dep": "Department of Electrical Engineering;NVIDIA Corporation", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com", "aff_unique_abbr": "Stanford;NVIDIA", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Unsupervised Detection of Contextualized Embedding Bias with Application to Ideology", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18023", "id": "18023", "proceeding": "https://proceedings.mlr.press/v162/hofmann22a.html", "poster": "/media/PosterPDFs/ICML%202022/5f8a7deb15235a128fcd99ad6bfde11e_QzZW2tz.png?t=1658736231.5232434", "slides": "", "author_site": "Valentin Hofmann, Janet Pierrehumbert, Hinrich Sch\u00fctze", "author": "Valentin Hofmann; Janet Pierrehumbert; Hinrich Sch\u00fctze", "abstract": "We propose a fully unsupervised method to detect bias in contextualized embeddings. The method leverages the assortative information latently encoded by social networks and combines orthogonality regularization, structured sparsity learning, and graph neural networks to find the embedding subspace capturing this information. As a concrete example, we focus on the phenomenon of ideological bias: we introduce the concept of an ideological subspace, show how it can be found by applying our method to online discussion forums, and present techniques to probe it. 
Our experiments suggest that the ideological subspace encodes abstract evaluative semantics and reflects changes in the political left-right spectrum during the presidency of Donald Trump.", "bibtex": "@InProceedings{pmlr-v162-hofmann22a,\n title = \t {Unsupervised Detection of Contextualized Embedding Bias with Application to Ideology},\n author = {Hofmann, Valentin and Pierrehumbert, Janet and Sch{\\\"u}tze, Hinrich},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8796--8810},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hofmann22a/hofmann22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hofmann22a.html},\n abstract = \t {We propose a fully unsupervised method to detect bias in contextualized embeddings. The method leverages the assortative information latently encoded by social networks and combines orthogonality regularization, structured sparsity learning, and graph neural networks to find the embedding subspace capturing this information. As a concrete example, we focus on the phenomenon of ideological bias: we introduce the concept of an ideological subspace, show how it can be found by applying our method to online discussion forums, and present techniques to probe it. Our experiments suggest that the ideological subspace encodes abstract evaluative semantics and reflects changes in the political left-right spectrum during the presidency of Donald Trump.}\n}", "pdf": "https://proceedings.mlr.press/v162/hofmann22a/hofmann22a.pdf", "supp": "", "pdf_size": 32972571, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZmTTboN8tJsJ:scholar.google.com/&scioq=Unsupervised+Detection+of+Contextualized+Embedding+Bias+with+Application+to+Ideology&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff": "Faculty of Linguistics, University of Oxford + Center for Information and Language Processing, LMU Munich; Department of Engineering Science, University of Oxford; Center for Information and Language Processing, LMU Munich", "aff_domain": "ling-phil.ox.ac.uk; ; ", "email": "ling-phil.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/hofmann22a.html", "aff_unique_index": "0+1;0;1", "aff_unique_norm": "University of Oxford;LMU Munich", "aff_unique_dep": "Faculty of Linguistics;Center for Information and Language Processing", "aff_unique_url": "https://www.ox.ac.uk;https://www.lmu.de", "aff_unique_abbr": "Oxford;LMU", "aff_campus_unique_index": "0+1;0;1", "aff_campus_unique": "Oxford;Munich", "aff_country_unique_index": "0+1;0;1", "aff_country_unique": "United Kingdom;Germany" }, { "title": "Unsupervised Flow-Aligned Sequence-to-Sequence Learning for Video Restoration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16143", "id": "16143", "proceeding": "https://proceedings.mlr.press/v162/lin22d.html", "poster": "/media/PosterPDFs/ICML%202022/6d70cb65d15211726dcce4c0e971e21c_54IRo89.png?t=1657246034.1944535", "slides": "/media/icml-2022/Slides/16143.pdf", "author_site": "Jing Lin, Xiaowan Hu, Yuanhao Cai, Haoqian Wang, Youliang Yan, Xueyi Zou, Yulun Zhang, Luc Van Gool", "author": "Jing Lin; Xiaowan Hu; Yuanhao Cai; Haoqian Wang; Youliang Yan; Xueyi 
Zou; Yulun Zhang; Luc Van Gool", "abstract": "How to properly model the inter-frame relation within the video sequence is an important but unsolved challenge for video restoration (VR). In this work, we propose an unsupervised flow-aligned sequence-to-sequence model (S2SVR) to address this problem. On the one hand, the sequence-to-sequence model, which has proven capable of sequence modeling in the field of natural language processing, is explored for the first time in VR. Optimized serialization modeling shows potential in capturing long-range dependencies among frames. On the other hand, we equip the sequence-to-sequence model with an unsupervised optical flow estimator to maximize its potential. The flow estimator is trained with our proposed unsupervised distillation loss, which can alleviate the data discrepancy and inaccurate degraded optical flow issues of previous flow-based methods. With reliable optical flow, we can establish accurate correspondence among multiple frames, narrowing the domain difference between 1D language and 2D misaligned frames and improving the potential of the sequence-to-sequence model. S2SVR shows superior performance in multiple VR tasks, including video deblurring, video super-resolution, and compressed video quality enhancement. https://github.com/linjing7/VR-Baseline", "bibtex": "@InProceedings{pmlr-v162-lin22d,\n title = \t {Unsupervised Flow-Aligned Sequence-to-Sequence Learning for Video Restoration},\n author = {Lin, Jing and Hu, Xiaowan and Cai, Yuanhao and Wang, Haoqian and Yan, Youliang and Zou, Xueyi and Zhang, Yulun and Van Gool, Luc},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13394--13404},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lin22d/lin22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/lin22d.html},\n abstract = \t {How to properly model the inter-frame relation within the video sequence is an important but unsolved challenge for video restoration (VR). In this work, we propose an unsupervised flow-aligned sequence-to-sequence model (S2SVR) to address this problem. On the one hand, the sequence-to-sequence model, which has proven capable of sequence modeling in the field of natural language processing, is explored for the first time in VR. Optimized serialization modeling shows potential in capturing long-range dependencies among frames. On the other hand, we equip the sequence-to-sequence model with an unsupervised optical flow estimator to maximize its potential. The flow estimator is trained with our proposed unsupervised distillation loss, which can alleviate the data discrepancy and inaccurate degraded optical flow issues of previous flow-based methods. With reliable optical flow, we can establish accurate correspondence among multiple frames, narrowing the domain difference between 1D language and 2D misaligned frames and improving the potential of the sequence-to-sequence model. S2SVR shows superior performance in multiple VR tasks, including video deblurring, video super-resolution, and compressed video quality enhancement. 
https://github.com/linjing7/VR-Baseline}\n}", "pdf": "https://proceedings.mlr.press/v162/lin22d/lin22d.pdf", "supp": "", "pdf_size": 2958794, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11447631455312360639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Shenzhen International Graduate School, Tsinghua University; Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; ETH Z\u00fcrich; ETH Z\u00fcrich", "aff_domain": "tsinghua.edu.cn; ; ; ;huawei.com; ; ; ", "email": "tsinghua.edu.cn; ; ; ;huawei.com; ; ; ", "github": "https://github.com/linjing7/VR-Baseline", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/lin22d.html", "aff_unique_index": "0;0;0;0;1;1;2;2", "aff_unique_norm": "Tsinghua University;Huawei;ETH Zurich", "aff_unique_dep": "Shenzhen International Graduate School;Noah\u2019s Ark Lab;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.huawei.com;https://www.ethz.ch", "aff_unique_abbr": "THU;Huawei;ETHZ", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Shenzhen;", "aff_country_unique_index": "0;0;0;0;0;0;1;1", "aff_country_unique": "China;Switzerland" }, { "title": "Unsupervised Ground Metric Learning Using Wasserstein Singular Vectors", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18307", "id": "18307", "proceeding": "https://proceedings.mlr.press/v162/huizing22a.html", "poster": "/media/PosterPDFs/ICML%202022/ecb287ff763c169694f682af52c1f309.png?t=1657797900.5335064", "slides": "", "author_site": "Geert-Jan Huizing, Laura Cantini, Gabriel Peyr\u00e9", "author": "Geert-Jan Huizing; Laura Cantini; Gabriel Peyr\u00e9", "abstract": "Defining meaningful distances between samples in a dataset is a fundamental problem in machine learning. Optimal Transport (OT) lifts a distance between features (the \"ground metric\") to a geometrically meaningful distance between samples. However, there is usually no straightforward choice of ground metric. Supervised ground metric learning approaches exist but require labeled data. In absence of labels, only ad-hoc ground metrics remain. Unsupervised ground metric learning is thus a fundamental problem to enable data-driven applications of OT. In this paper, we propose for the first time a canonical answer by simultaneously computing an OT distance between samples and between features of a dataset. These distance matrices emerge naturally as positive singular vectors of the function mapping ground metrics to OT distances. We provide criteria to ensure the existence and uniqueness of these singular vectors. We then introduce scalable computational methods to approximate them in high-dimensional settings, using stochastic approximation and entropic regularization. 
Finally, we showcase Wasserstein Singular Vectors on a single-cell RNA-sequencing dataset.", "bibtex": "@InProceedings{pmlr-v162-huizing22a,\n title = \t {Unsupervised Ground Metric Learning Using {W}asserstein Singular Vectors},\n author = {Huizing, Geert-Jan and Cantini, Laura and Peyr{\\'e}, Gabriel},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {9429--9443},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/huizing22a/huizing22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/huizing22a.html},\n abstract = \t {Defining meaningful distances between samples in a dataset is a fundamental problem in machine learning. Optimal Transport (OT) lifts a distance between features (the \"ground metric\") to a geometrically meaningful distance between samples. However, there is usually no straightforward choice of ground metric. Supervised ground metric learning approaches exist but require labeled data. In absence of labels, only ad-hoc ground metrics remain. Unsupervised ground metric learning is thus a fundamental problem to enable data-driven applications of OT. In this paper, we propose for the first time a canonical answer by simultaneously computing an OT distance between samples and between features of a dataset. These distance matrices emerge naturally as positive singular vectors of the function mapping ground metrics to OT distances. We provide criteria to ensure the existence and uniqueness of these singular vectors. We then introduce scalable computational methods to approximate them in high-dimensional settings, using stochastic approximation and entropic regularization. 
Finally, we showcase Wasserstein Singular Vectors on a single-cell RNA-sequencing dataset.}\n}", "pdf": "https://proceedings.mlr.press/v162/huizing22a/huizing22a.pdf", "supp": "", "pdf_size": 2710845, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15888088169122917171&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "D\u00e9partement de math\u00e9matiques et applications de l\u2019Ecole Normale Sup\u00e9rieure, CNRS, Ecole Normale Sup\u00e9rieure, Universit\u00e9 PSL, 75005, Paris, France+Computational Systems Biology Team, Institut de Biologie de l\u2019Ecole Normale Sup\u00e9rieure, CNRS, INSERM, Ecole Normale Sup\u00e9rieure, Universit\u00e9 PSL, 75005, Paris, France; Computational Systems Biology Team, Institut de Biologie de l\u2019Ecole Normale Sup\u00e9rieure, CNRS, INSERM, Ecole Normale Sup\u00e9rieure, Universit\u00e9 PSL, 75005, Paris, France; D\u00e9partement de math\u00e9matiques et applications de l\u2019Ecole Normale Sup\u00e9rieure, CNRS, Ecole Normale Sup\u00e9rieure, Universit\u00e9 PSL, 75005, Paris, France", "aff_domain": "ens.fr; ;ens.fr", "email": "ens.fr; ;ens.fr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/huizing22a.html", "aff_unique_index": "0+1;1;0", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Institut de Biologie de l\u2019Ecole Normale Sup\u00e9rieure", "aff_unique_dep": "D\u00e9partement de math\u00e9matiques et applications;Computational Systems Biology Team", "aff_unique_url": "https://www.ens.fr;", "aff_unique_abbr": "ENS;", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "France" }, { "title": "Unsupervised Image Representation Learning with Deep Latent Particles", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17267", "id": "17267", "proceeding": "https://proceedings.mlr.press/v162/daniel22a.html", "poster": "/media/PosterPDFs/ICML%202022/eded0708dfe855304a50029fccf1a677.png?t=1656832366.8915522", "slides": "/media/icml-2022/Slides/17267.pdf", "author_site": "Tal Daniel, Aviv Tamar", "author": "Tal Daniel; Aviv Tamar", "abstract": "We propose a new representation of visual data that disentangles object position from appearance. Our method, termed Deep Latent Particles (DLP), decomposes the visual input into low-dimensional latent \u201cparticles\u201d, where each particle is described by its spatial location and features of its surrounding region. To drive learning of such representations, we follow a VAE-based approach and introduce a prior for particle positions based on a spatial-Softmax architecture, and a modification of the evidence lower bound loss inspired by the Chamfer distance between particles. We demonstrate that our DLP representations are useful for downstream tasks such as unsupervised keypoint (KP) detection, image manipulation, and video prediction for scenes composed of multiple dynamic objects. 
In addition, we show that our probabilistic interpretation of the problem naturally provides uncertainty estimates for particle locations, which can be used for model selection, among other tasks.", "bibtex": "@InProceedings{pmlr-v162-daniel22a,\n title = \t {Unsupervised Image Representation Learning with Deep Latent Particles},\n author = {Daniel, Tal and Tamar, Aviv},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {4644--4665},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/daniel22a/daniel22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/daniel22a.html},\n abstract = \t {We propose a new representation of visual data that disentangles object position from appearance. Our method, termed Deep Latent Particles (DLP), decomposes the visual input into low-dimensional latent \u201cparticles\u201d, where each particle is described by its spatial location and features of its surrounding region. To drive learning of such representations, we follow a VAE-based approach and introduce a prior for particle positions based on a spatial-Softmax architecture, and a modification of the evidence lower bound loss inspired by the Chamfer distance between particles. We demonstrate that our DLP representations are useful for downstream tasks such as unsupervised keypoint (KP) detection, image manipulation, and video prediction for scenes composed of multiple dynamic objects. In addition, we show that our probabilistic interpretation of the problem naturally provides uncertainty estimates for particle locations, which can be used for model selection, among other tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/daniel22a/daniel22a.pdf", "supp": "", "pdf_size": 13950677, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8443981998714808027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, Technion - Israel Institute of Technology, Haifa, Israel; Department of Electrical and Computer Engineering, Technion - Israel Institute of Technology, Haifa, Israel", "aff_domain": "campus.technion.ac.il; ", "email": "campus.technion.ac.il; ", "github": "https://taldatech.github.io/deep-latent-particles-web/", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/daniel22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Unsupervised Time-Series Representation Learning with Iterative Bilinear Temporal-Spectral Fusion", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16051", "id": "16051", "proceeding": "https://proceedings.mlr.press/v162/yang22e.html", "poster": "/media/PosterPDFs/ICML%202022/009c434cab57de48a31f6b669e7ba266_hPcNn6H.png?t=1657158905.0398924", "slides": "/media/icml-2022/Slides/16051.pdf", "author_site": "Ling Yang, Shenda Hong", "author": "Ling Yang; Shenda Hong", "abstract": 
"Unsupervised/self-supervised time series representation learning is a challenging problem because of its complex dynamics and sparse annotations. Existing works mainly adopt the framework of contrastive learning with the time-based augmentation techniques to sample positives and negatives for contrastive training. Nevertheless, they mostly use segment-level augmentation derived from time slicing, which may bring about sampling bias and incorrect optimization with false negatives due to the loss of global context. Besides, they all pay no attention to incorporate the spectral information in feature representation. In this paper, we propose a unified framework, namely Bilinear Temporal-Spectral Fusion (BTSF). Specifically, we firstly utilize the instance-level augmentation with a simple dropout on the entire time series for maximally capturing long-term dependencies. We devise a novel iterative bilinear temporal-spectral fusion to explicitly encode the affinities of abundant time-frequency pairs, and iteratively refines representations in a fusion-and-squeeze manner with Spectrum-to-Time (S2T) and Time-to-Spectrum (T2S) Aggregation modules. We firstly conducts downstream evaluations on three major tasks for time series including classification, forecasting and anomaly detection. Experimental results shows that our BTSF consistently significantly outperforms the state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v162-yang22e,\n title = \t {Unsupervised Time-Series Representation Learning with Iterative Bilinear Temporal-Spectral Fusion},\n author = {Yang, Ling and Hong, Shenda},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {25038--25054},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/yang22e/yang22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/yang22e.html},\n abstract = \t {Unsupervised/self-supervised time series representation learning is a challenging problem because of its complex dynamics and sparse annotations. Existing works mainly adopt the framework of contrastive learning with the time-based augmentation techniques to sample positives and negatives for contrastive training. Nevertheless, they mostly use segment-level augmentation derived from time slicing, which may bring about sampling bias and incorrect optimization with false negatives due to the loss of global context. Besides, they all pay no attention to incorporate the spectral information in feature representation. In this paper, we propose a unified framework, namely Bilinear Temporal-Spectral Fusion (BTSF). Specifically, we firstly utilize the instance-level augmentation with a simple dropout on the entire time series for maximally capturing long-term dependencies. We devise a novel iterative bilinear temporal-spectral fusion to explicitly encode the affinities of abundant time-frequency pairs, and iteratively refines representations in a fusion-and-squeeze manner with Spectrum-to-Time (S2T) and Time-to-Spectrum (T2S) Aggregation modules. We firstly conducts downstream evaluations on three major tasks for time series including classification, forecasting and anomaly detection. 
Experimental results show that our BTSF consistently and significantly outperforms the state-of-the-art methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/yang22e/yang22e.pdf", "supp": "", "pdf_size": 1145167, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=650024593787992484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "National Institute of Health Data Science, Peking University, Beijing, China+Institute of Medical Technology, Health Science Center of Peking University, Beijing, China; National Institute of Health Data Science, Peking University, Beijing, China+Institute of Medical Technology, Health Science Center of Peking University, Beijing, China", "aff_domain": "163.com;pku.edu.cn", "email": "163.com;pku.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/yang22e.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "Peking University", "aff_unique_dep": "National Institute of Health Data Science", "aff_unique_url": "http://www.pku.edu.cn", "aff_unique_abbr": "PKU", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "China" }, { "title": "Utility Theory for Sequential Decision Making", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16847", "id": "16847", "proceeding": "https://proceedings.mlr.press/v162/shakerinava22a.html", "poster": "", "slides": "", "author_site": "Mehran Shakerinava, Siamak Ravanbakhsh", "author": "Mehran Shakerinava; Siamak Ravanbakhsh", "abstract": "The von Neumann-Morgenstern (VNM) utility theorem shows that under certain axioms of rationality, decision-making is reduced to maximizing the expectation of some utility function. We extend these axioms to increasingly structured sequential decision making settings and identify the structure of the corresponding utility functions. In particular, we show that memoryless preferences lead to a utility in the form of a per transition reward and multiplicative factor on the future return. This result motivates a generalization of Markov Decision Processes (MDPs) with this structure on the agent\u2019s returns, which we call Affine-Reward MDPs. A stronger constraint on preferences is needed to recover the commonly used cumulative sum of scalar rewards in MDPs. A yet stronger constraint simplifies the utility function for goal-seeking agents in the form of a difference in some function of states that we call potential functions. 
Our necessary and sufficient conditions demystify the reward hypothesis that underlies the design of rational agents in reinforcement learning by adding an axiom to the VNM rationality axioms and motivate new directions for AI research involving sequential decision making.", "bibtex": "@InProceedings{pmlr-v162-shakerinava22a,\n title = \t {Utility Theory for Sequential Decision Making},\n author = {Shakerinava, Mehran and Ravanbakhsh, Siamak},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19616--19625},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shakerinava22a/shakerinava22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/shakerinava22a.html},\n abstract = \t {The von Neumann-Morgenstern (VNM) utility theorem shows that under certain axioms of rationality, decision-making is reduced to maximizing the expectation of some utility function. We extend these axioms to increasingly structured sequential decision making settings and identify the structure of the corresponding utility functions. In particular, we show that memoryless preferences lead to a utility in the form of a per transition reward and multiplicative factor on the future return. This result motivates a generalization of Markov Decision Processes (MDPs) with this structure on the agent\u2019s returns, which we call Affine-Reward MDPs. A stronger constraint on preferences is needed to recover the commonly used cumulative sum of scalar rewards in MDPs. A yet stronger constraint simplifies the utility function for goal-seeking agents in the form of a difference in some function of states that we call potential functions. 
Our necessary and sufficient conditions demystify the reward hypothesis that underlies the design of rational agents in reinforcement learning by adding an axiom to the VNM rationality axioms and motivate new directions for AI research involving sequential decision making.}\n}", "pdf": "https://proceedings.mlr.press/v162/shakerinava22a/shakerinava22a.pdf", "supp": "", "pdf_size": 323612, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5113770047826982186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computer Science, McGill University, Montreal, Canada + Mila - Quebec AI Institute; School of Computer Science, McGill University, Montreal, Canada + Mila - Quebec AI Institute", "aff_domain": "mila.quebec; ", "email": "mila.quebec; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/shakerinava22a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "McGill University;Quebec AI Institute", "aff_unique_dep": "School of Computer Science;AI Institute", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec", "aff_unique_abbr": "McGill;Mila", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Canada" }, { "title": "Utilizing Expert Features for Contrastive Learning of Time-Series Representations", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18037", "id": "18037", "proceeding": "https://proceedings.mlr.press/v162/nonnenmacher22a.html", "poster": "/media/PosterPDFs/ICML%202022/14db62200d8bf46551aa214accafe1df.png?t=1657519020.917496", "slides": "", "author_site": "Manuel Nonnenmacher, Lukas Oldenburg, Ingo Steinwart, David Reeb", "author": "Manuel T Nonnenmacher; Lukas Oldenburg; Ingo Steinwart; David Reeb", "abstract": "We present an approach that incorporates expert knowledge for time-series representation learning. Our method employs expert features to replace the commonly used data transformations in previous contrastive learning approaches. We do this since time-series data frequently stems from the industrial or medical field where expert features are often available from domain experts, while transformations are generally elusive for time-series data. We start by proposing two properties that useful time-series representations should fulfill and show that current representation learning approaches do not ensure these properties. We therefore devise ExpCLR, a novel contrastive learning approach built on an objective that utilizes expert features to encourage both properties for the learned representation. 
Finally, we demonstrate on three real-world time-series datasets that ExpCLR surpasses several state-of-the-art methods for both unsupervised and semi-supervised representation learning.", "bibtex": "@InProceedings{pmlr-v162-nonnenmacher22a,\n title = \t {Utilizing Expert Features for Contrastive Learning of Time-Series Representations},\n author = {Nonnenmacher, Manuel T and Oldenburg, Lukas and Steinwart, Ingo and Reeb, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16969--16989},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nonnenmacher22a/nonnenmacher22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nonnenmacher22a.html},\n abstract = \t {We present an approach that incorporates expert knowledge for time-series representation learning. Our method employs expert features to replace the commonly used data transformations in previous contrastive learning approaches. We do this since time-series data frequently stems from the industrial or medical field where expert features are often available from domain experts, while transformations are generally elusive for time-series data. We start by proposing two properties that useful time-series representations should fulfill and show that current representation learning approaches do not ensure these properties. We therefore devise ExpCLR, a novel contrastive learning approach built on an objective that utilizes expert features to encourage both properties for the learned representation. 
Finally, we demonstrate on three real-world time-series datasets that ExpCLR surpasses several state-of-the-art methods for both unsupervised and semi-supervised representation learning.}\n}", "pdf": "https://proceedings.mlr.press/v162/nonnenmacher22a/nonnenmacher22a.pdf", "supp": "", "pdf_size": 525962, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16790455232498977165&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Bosch Center for Artificial Intelligence (BCAI), Robert Bosch GmbH, Renningen, Germany+Institute for Stochastics and Applications, University of Stuttgart, Stuttgart, Germany; Bosch Center for Artificial Intelligence (BCAI), Robert Bosch GmbH, Renningen, Germany; Institute for Stochastics and Applications, University of Stuttgart, Stuttgart, Germany; Bosch Center for Artificial Intelligence (BCAI), Robert Bosch GmbH, Renningen, Germany", "aff_domain": "de.bosch.com; ; ; ", "email": "de.bosch.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/nonnenmacher22a.html", "aff_unique_index": "0+1;0;1;0", "aff_unique_norm": "Robert Bosch GmbH;University of Stuttgart", "aff_unique_dep": "Bosch Center for Artificial Intelligence (BCAI);Institute for Stochastics and Applications", "aff_unique_url": "https://www.bosch.com;https://www.uni-stuttgart.de", "aff_unique_abbr": "BCAI;", "aff_campus_unique_index": "0+1;0;1;0", "aff_campus_unique": "Renningen;Stuttgart", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "Germany" }, { "title": "VLMixer: Unpaired Vision-Language Pre-training via Cross-Modal CutMix", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18165", "id": "18165", "proceeding": "https://proceedings.mlr.press/v162/wang22h.html", "poster": "/media/PosterPDFs/ICML%202022/8f121ce07d74717e0b1f21d122e04521.png?t=1656584670.3907514", "slides": "/media/icml-2022/Slides/18165.pdf", "author_site": "Teng Wang, Wenhao Jiang, Zhichao Lu, Feng Zheng, Ran Cheng, chengguo yin, Ping Luo", "author": "Teng Wang; Wenhao Jiang; Zhichao Lu; Feng Zheng; Ran Cheng; Chengguo Yin; Ping Luo", "abstract": "Existing vision-language pre-training (VLP) methods primarily rely on paired image-text datasets, which are either annotated by enormous human labors or crawled from the internet followed by elaborate data cleaning techniques. To reduce the dependency on well-aligned image-text pairs, it is promising to directly leverage the large-scale text-only and image-only corpora. This paper proposes a data augmentation method, namely cross-modal CutMix (CMC), for implicit cross-modal alignment learning in unpaired VLP. Specifically, CMC transforms natural sentences in the textual view into a multi-modal view, where visually-grounded words in a sentence are randomly replaced by diverse image patches with similar semantics. There are several appealing proprieties of the proposed CMC. First, it enhances the data diversity while keeping the semantic meaning intact for tackling problems where the aligned data are scarce; Second, by attaching cross-modal noise on uni-modal data, it guides models to learn token-level interactions across modalities for better denoising. Furthermore, we present a new unpaired VLP method, dubbed as VLMixer, that integrates CMC with contrastive learning to pull together the uni-modal and multi-modal views for better instance-level alignments among different modalities. 
Extensive experiments on five downstream tasks show that VLMixer could surpass previous state-of-the-art unpaired VLP methods.", "bibtex": "@InProceedings{pmlr-v162-wang22h,\n title = \t {{VLM}ixer: Unpaired Vision-Language Pre-training via Cross-Modal {C}ut{M}ix},\n author = {Wang, Teng and Jiang, Wenhao and Lu, Zhichao and Zheng, Feng and Cheng, Ran and Yin, Chengguo and Luo, Ping},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22680--22690},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22h/wang22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22h.html},\n abstract = \t {Existing vision-language pre-training (VLP) methods primarily rely on paired image-text datasets, which are either annotated by enormous human labors or crawled from the internet followed by elaborate data cleaning techniques. To reduce the dependency on well-aligned image-text pairs, it is promising to directly leverage the large-scale text-only and image-only corpora. This paper proposes a data augmentation method, namely cross-modal CutMix (CMC), for implicit cross-modal alignment learning in unpaired VLP. Specifically, CMC transforms natural sentences in the textual view into a multi-modal view, where visually-grounded words in a sentence are randomly replaced by diverse image patches with similar semantics. There are several appealing proprieties of the proposed CMC. First, it enhances the data diversity while keeping the semantic meaning intact for tackling problems where the aligned data are scarce; Second, by attaching cross-modal noise on uni-modal data, it guides models to learn token-level interactions across modalities for better denoising. Furthermore, we present a new unpaired VLP method, dubbed as VLMixer, that integrates CMC with contrastive learning to pull together the uni-modal and multi-modal views for better instance-level alignments among different modalities. Extensive experiments on five downstream tasks show that VLMixer could surpass previous state-of-the-art unpaired VLP methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22h/wang22h.pdf", "supp": "", "pdf_size": 529725, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6137962123845990063&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "https://github.com/ttengwang/VLMixer", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/wang22h.html" }, { "title": "VLUE: A Multi-Task Multi-Dimension Benchmark for Evaluating Vision-Language Pre-training", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17677", "id": "17677", "proceeding": "https://proceedings.mlr.press/v162/zhou22n.html", "poster": "", "slides": "", "author_site": "Wangchunshu Zhou, Yan Zeng, shizhe diao, Xinsong Zhang", "author": "Wangchunshu Zhou; Yan Zeng; Shizhe Diao; Xinsong Zhang", "abstract": "Recent advances in vision-language pre-training (VLP) have demonstrated impressive performance in a range of vision-language (VL) tasks. However, there exist several challenges for measuring the community\u2019s progress in building general multi-modal intelligence. 
First, most of the downstream VL datasets are annotated using raw images that are already seen during pre-training, which may result in an overestimation of current VLP models\u2019 generalization ability. Second, recent VLP work mainly focuses on absolute performance but overlooks the efficiency-performance trade-off, which is also an important indicator for measuring progress. To this end, we introduce the Vision-Language Understanding Evaluation (VLUE) benchmark, a multi-task multi-dimension benchmark for evaluating the generalization capabilities and the efficiency-performance trade-off (\u201cPareto SOTA\u201d) of VLP models. We demonstrate that there is a sizable generalization gap for all VLP models when testing on out-of-distribution test sets annotated on images from a more diverse distribution that spreads across cultures. Moreover, we find that measuring the efficiency-performance trade-off of VLP models leads to complementary insights for several design choices of VLP. We release the VLUE benchmark to promote research on building vision-language models that generalize well to images unseen during pre-training and are practical in terms of efficiency-performance trade-off.", "bibtex": "@InProceedings{pmlr-v162-zhou22n,\n title = \t {{VLUE}: A Multi-Task Multi-Dimension Benchmark for Evaluating Vision-Language Pre-training},\n author = {Zhou, Wangchunshu and Zeng, Yan and Diao, Shizhe and Zhang, Xinsong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27395--27411},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhou22n/zhou22n.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhou22n.html},\n abstract = \t {Recent advances in vision-language pre-training (VLP) have demonstrated impressive performance in a range of vision-language (VL) tasks. However, there exist several challenges for measuring the community\u2019s progress in building general multi-modal intelligence. First, most of the downstream VL datasets are annotated using raw images that are already seen during pre-training, which may result in an overestimation of current VLP models\u2019 generalization ability. Second, recent VLP work mainly focuses on absolute performance but overlooks the efficiency-performance trade-off, which is also an important indicator for measuring progress. To this end, we introduce the Vision-Language Understanding Evaluation (VLUE) benchmark, a multi-task multi-dimension benchmark for evaluating the generalization capabilities and the efficiency-performance trade-off (\u201cPareto SOTA\u201d) of VLP models. We demonstrate that there is a sizable generalization gap for all VLP models when testing on out-of-distribution test sets annotated on images from a more diverse distribution that spreads across cultures. Moreover, we find that measuring the efficiency-performance trade-off of VLP models leads to complementary insights for several design choices of VLP. 
We release the VLUE benchmark to promote research on building vision-language models that generalize well to images unseen during pre-training and are practical in terms of efficiency-performance trade-off.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhou22n/zhou22n.pdf", "supp": "", "pdf_size": 3844580, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8276074346037258646&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff": "ByteDance AI Lab; ByteDance AI Lab; The Hong Kong University of Science and Technology; ByteDance AI Lab", "aff_domain": "outlook.com; ; ; ", "email": "outlook.com; ; ; ", "github": "https://github.com/MichaelZhouwang/VLUE", "project": "https://vlue-benchmark.github.io", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhou22n.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "ByteDance;Hong Kong University of Science and Technology", "aff_unique_dep": "AI Lab;", "aff_unique_url": "https://www.bytedance.com;https://www.ust.hk", "aff_unique_abbr": "ByteDance;HKUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Validating Causal Inference Methods", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16511", "id": "16511", "proceeding": "https://proceedings.mlr.press/v162/parikh22a.html", "poster": "/media/PosterPDFs/ICML%202022/87ba276ebbe553ec05d2f5b37c20125f_qLknJFB.png?t=1657933009.5594323", "slides": "/media/icml-2022/Slides/16511.pdf", "author_site": "Harsh Parikh, Carlos Varjao, Louise Xu, Eric Tchetgen Tchetgen", "author": "Harsh Parikh; Carlos Varjao; Louise Xu; Eric Tchetgen Tchetgen", "abstract": "The fundamental challenge of drawing causal inference is that counterfactual outcomes are not fully observed for any unit. Furthermore, in observational studies, treatment assignment is likely to be confounded. Many statistical methods have emerged for causal inference under unconfoundedness conditions given pre-treatment covariates, including propensity score-based methods, prognostic score-based methods, and doubly robust methods. Unfortunately for applied researchers, there is no \u2018one-size-fits-all\u2019 causal method that can perform optimally universally. In practice, causal methods are primarily evaluated quantitatively on handcrafted simulated data. Such data-generative procedures can be of limited value because they are typically stylized models of reality. They are simplified for tractability and lack the complexities of real-world data. For applied researchers, it is critical to understand how well a method performs for the data at hand. Our work introduces a deep generative model-based framework, Credence, to validate causal inference methods. The framework\u2019s novelty stems from its ability to generate synthetic data anchored at the empirical distribution for the observed sample, and therefore virtually indistinguishable from the latter. The approach allows the user to specify ground truth for the form and magnitude of causal effects and confounding bias as functions of covariates. Thus simulated data sets are used to evaluate the potential performance of various causal estimation methods when applied to data similar to the observed sample. 
We demonstrate Credence\u2019s ability to accurately assess the relative performance of causal estimation techniques in an extensive simulation study and two real-world data applications from Lalonde and Project STAR studies.", "bibtex": "@InProceedings{pmlr-v162-parikh22a,\n title = \t {Validating Causal Inference Methods},\n author = {Parikh, Harsh and Varjao, Carlos and Xu, Louise and Tchetgen, Eric Tchetgen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17346--17358},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/parikh22a/parikh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/parikh22a.html},\n abstract = \t {The fundamental challenge of drawing causal inference is that counterfactual outcomes are not fully observed for any unit. Furthermore, in observational studies, treatment assignment is likely to be confounded. Many statistical methods have emerged for causal inference under unconfoundedness conditions given pre-treatment covariates, including propensity score-based methods, prognostic score-based methods, and doubly robust methods. Unfortunately for applied researchers, there is no \u2018one-size-fits-all\u2019 causal method that can perform optimally universally. In practice, causal methods are primarily evaluated quantitatively on handcrafted simulated data. Such data-generative procedures can be of limited value because they are typically stylized models of reality. They are simplified for tractability and lack the complexities of real-world data. For applied researchers, it is critical to understand how well a method performs for the data at hand. Our work introduces a deep generative model-based framework, Credence, to validate causal inference methods. The framework\u2019s novelty stems from its ability to generate synthetic data anchored at the empirical distribution for the observed sample, and therefore virtually indistinguishable from the latter. The approach allows the user to specify ground truth for the form and magnitude of causal effects and confounding bias as functions of covariates. Thus simulated data sets are used to evaluate the potential performance of various causal estimation methods when applied to data similar to the observed sample. 
We demonstrate Credence\u2019s ability to accurately assess the relative performance of causal estimation techniques in an extensive simulation study and two real-world data applications from Lalonde and Project STAR studies.}\n}", "pdf": "https://proceedings.mlr.press/v162/parikh22a/parikh22a.pdf", "supp": "", "pdf_size": 1394283, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17558404505134173796&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Duke University; Amazon.com; Amazon.com; The Wharton School, University of Pennsylvania", "aff_domain": "duke.edu; ; ; ", "email": "duke.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/parikh22a.html", "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Duke University;Amazon;University of Pennsylvania", "aff_unique_dep": ";Amazon;The Wharton School", "aff_unique_url": "https://www.duke.edu;https://www.amazon.com;https://www.wharton.upenn.edu", "aff_unique_abbr": "Duke;Amazon;UPenn Wharton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Value Function based Difference-of-Convex Algorithm for Bilevel Hyperparameter Selection Problems", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17373", "id": "17373", "proceeding": "https://proceedings.mlr.press/v162/gao22j.html", "poster": "/media/PosterPDFs/ICML%202022/b56a18e0eacdf51aa2a5306b0f533204.png?t=1657589925.681198", "slides": "", "author_site": "Lucy Gao, Jane J. Ye, Haian Yin, Shangzhi Zeng, Jin Zhang", "author": "Lucy L Gao; Jane Ye; Haian Yin; Shangzhi Zeng; Jin Zhang", "abstract": "Existing gradient-based optimization methods for hyperparameter tuning can only guarantee theoretical convergence to stationary solutions when the bilevel program satisfies the condition that for fixed upper-level variables, the lower-level is strongly convex (LLSC) and smooth (LLS). This condition is not satisfied for bilevel programs arising from tuning hyperparameters in many machine learning algorithms. In this work, we develop a sequentially convergent Value Function based Difference-of-Convex Algorithm with inexactness (VF-iDCA). We then ask: can this algorithm achieve stationary solutions without LLSC and LLS assumptions? We provide a positive answer to this question for bilevel programs from a broad class of hyperparameter tuning applications. 
Extensive experiments justify our theoretical results and demonstrate the superiority of the proposed VF-iDCA when applied to tune hyperparameters.", "bibtex": "@InProceedings{pmlr-v162-gao22j,\n title = \t {Value Function based Difference-of-Convex Algorithm for Bilevel Hyperparameter Selection Problems},\n author = {Gao, Lucy L and Ye, Jane and Yin, Haian and Zeng, Shangzhi and Zhang, Jin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7164--7182},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gao22j/gao22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/gao22j.html},\n abstract = \t {Existing gradient-based optimization methods for hyperparameter tuning can only guarantee theoretical convergence to stationary solutions when the bilevel program satisfies the condition that for fixed upper-level variables, the lower-level is strongly convex (LLSC) and smooth (LLS). This condition is not satisfied for bilevel programs arising from tuning hyperparameters in many machine learning algorithms. In this work, we develop a sequentially convergent Value Function based Difference-of-Convex Algorithm with inexactness (VF-iDCA). We then ask: can this algorithm achieve stationary solutions without LLSC and LLS assumptions? We provide a positive answer to this question for bilevel programs from a broad class of hyperparameter tuning applications. Extensive experiments justify our theoretical results and demonstrate the superiority of the proposed VF-iDCA when applied to tune hyperparameters.}\n}", "pdf": "https://proceedings.mlr.press/v162/gao22j/gao22j.pdf", "supp": "", "pdf_size": 771656, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5559492833861486776&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Statistics and Actuarial Science, University of Waterloo, Waterloo, Ontario, Canada; Department of Mathematics and Statistics, University of Victoria, Victoria, British Columbia, Canada; Department of Mathematics, SUSTech International Center for Mathematics, Southern University of Science and Technology, Shenzhen, Guangdong, China + National Center for Applied Mathematics Shenzhen, Shenzhen, Guangdong, China; Department of Mathematics and Statistics, University of Victoria, Victoria, British Columbia, Canada; Department of Mathematics, SUSTech International Center for Mathematics, Southern University of Science and Technology, Shenzhen, Guangdong, China + National Center for Applied Mathematics Shenzhen, Shenzhen, Guangdong, China", "aff_domain": "sustech.edu.cn; ; ; ; ", "email": "sustech.edu.cn; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/gao22j.html", "aff_unique_index": "0;1;2+3;1;2+3", "aff_unique_norm": "University of Waterloo;University of Victoria;Southern University of Science and Technology;National Center for Applied Mathematics", "aff_unique_dep": "Department of Statistics and Actuarial Science;Department of Mathematics and Statistics;Department of Mathematics;", "aff_unique_url": "https://uwaterloo.ca;https://www.uvic.ca;https://www.sustech.edu.cn;", "aff_unique_abbr": "UWaterloo;UVic;SUSTech;", "aff_campus_unique_index": "0;1;2+2;1;2+2", 
"aff_campus_unique": "Waterloo;Victoria;Shenzhen", "aff_country_unique_index": "0;0;1+1;0;1+1", "aff_country_unique": "Canada;China" }, { "title": "VarScene: A Deep Generative Model for Realistic Scene Graph Synthesis", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16891", "id": "16891", "proceeding": "https://proceedings.mlr.press/v162/verma22b.html", "poster": "/media/PosterPDFs/ICML%202022/1becf26e9f32353e30870060538746e7.png?t=1658340929.2500288", "slides": "", "author_site": "Tathagat Verma, Abir De, Yateesh Agrawal, Vishwa Vinay, Soumen Chakrabarti", "author": "Tathagat Verma; Abir De; Yateesh Agrawal; Vishwa Vinay; Soumen Chakrabarti", "abstract": "Scene graphs are powerful abstractions that capture relationships between objects in images by modeling objects as nodes and relationships as edges. Generation of realistic synthetic scene graphs has applications like scene synthesis and data augmentation for supervised learning. Existing graph generative models are predominantly targeted toward molecular graphs, leveraging the limited vocabulary of atoms and bonds and also the well-defined semantics of chemical compounds. In contrast, scene graphs have much larger object and relation vocabularies, and their semantics are latent. To address this challenge, we propose a variational autoencoder for scene graphs, which is optimized for the maximum mean discrepancy (MMD) between the ground truth scene graph distribution and distribution of the generated scene graphs. Our method views a scene graph as a collection of star graphs and encodes it into a latent representation of the underlying stars. The decoder generates scene graphs by learning to sample the component stars and edges between them. Our experiments show that our method is able to mimic the underlying scene graph generative process more accurately than several state-of-the-art baselines.", "bibtex": "@InProceedings{pmlr-v162-verma22b,\n title = \t {{V}ar{S}cene: A Deep Generative Model for Realistic Scene Graph Synthesis},\n author = {Verma, Tathagat and De, Abir and Agrawal, Yateesh and Vinay, Vishwa and Chakrabarti, Soumen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22168--22183},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/verma22b/verma22b.pdf},\n url = \t {https://proceedings.mlr.press/v162/verma22b.html},\n abstract = \t {Scene graphs are powerful abstractions that capture relationships between objects in images by modeling objects as nodes and relationships as edges. Generation of realistic synthetic scene graphs has applications like scene synthesis and data augmentation for supervised learning. Existing graph generative models are predominantly targeted toward molecular graphs, leveraging the limited vocabulary of atoms and bonds and also the well-defined semantics of chemical compounds. In contrast, scene graphs have much larger object and relation vocabularies, and their semantics are latent. To address this challenge, we propose a variational autoencoder for scene graphs, which is optimized for the maximum mean discrepancy (MMD) between the ground truth scene graph distribution and distribution of the generated scene graphs. 
Our method views a scene graph as a collection of star graphs and encodes it into a latent representation of the underlying stars. The decoder generates scene graphs by learning to sample the component stars and edges between them. Our experiments show that our method is able to mimic the underlying scene graph generative process more accurately than several state-of-the-art baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/verma22b/verma22b.pdf", "supp": "", "pdf_size": 1928165, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4123688367189271676&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai, India; Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai, India; Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai, India; Adobe Research, India; Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai, India", "aff_domain": "cse.iitb.ac.in;cse.iitb.ac.in;cse.iitb.ac.in;adobe.com;cse.iitb.ac.in", "email": "cse.iitb.ac.in;cse.iitb.ac.in;cse.iitb.ac.in;adobe.com;cse.iitb.ac.in", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/verma22b.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Indian Institute of Technology Bombay;Adobe", "aff_unique_dep": "Department of Computer Science and Engineering;Adobe Research", "aff_unique_url": "https://www.iitb.ac.in;https://research.adobe.com", "aff_unique_abbr": "IIT Bombay;Adobe", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mumbai;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "India" }, { "title": "VariGrow: Variational Architecture Growing for Task-Agnostic Continual Learning based on Bayesian Novelty", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16777", "id": "16777", "proceeding": "https://proceedings.mlr.press/v162/ardywibowo22a.html", "poster": "/media/PosterPDFs/ICML%202022/2118d8a1b7004ed5baf5347a4f99f502_VBoOFJE.png?t=1657493684.7108188", "slides": "", "author_site": "Randy Ardywibowo, Zepeng Huo, Zhangyang \u201cAtlas\u201d Wang, Bobak Mortazavi, Shuai Huang, Xiaoning Qian", "author": "Randy Ardywibowo; Zepeng Huo; Zhangyang Wang; Bobak J Mortazavi; Shuai Huang; Xiaoning Qian", "abstract": "Continual Learning (CL) is the problem of sequentially learning a set of tasks and preserving all the knowledge acquired. Many existing methods assume that the data stream is explicitly divided into a sequence of known contexts (tasks), and use this information to know when to transfer knowledge from one context to another. Unfortunately, many real-world CL scenarios have no clear task nor context boundaries, motivating the study of task-agnostic CL, where neither the specific tasks nor their switches are known both in training and testing. This paper proposes a variational architecture growing framework dubbed VariGrow. By interpreting dynamically growing neural networks as a Bayesian approximation, and defining flexible implicit variational distributions, VariGrow detects if a new task is arriving through an energy-based novelty score. If the novelty score is high and the sample is \u201cdetected\" as a new task, VariGrow will grow a new expert module to be responsible for it. 
Otherwise, the sample will be assigned to one of the existing experts who is most \u201cfamiliar\" with it (i.e., one with the lowest novelty score). We have tested VariGrow on several CIFAR and ImageNet-based benchmarks for the strict task-agnostic CL setting and demonstrate its consistent superior performance. Perhaps surprisingly, its performance can even be competitive compared to task-aware methods.", "bibtex": "@InProceedings{pmlr-v162-ardywibowo22a,\n title = \t {{V}ari{G}row: Variational Architecture Growing for Task-Agnostic Continual Learning based on {B}ayesian Novelty},\n author = {Ardywibowo, Randy and Huo, Zepeng and Wang, Zhangyang and Mortazavi, Bobak J and Huang, Shuai and Qian, Xiaoning},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {865--877},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ardywibowo22a/ardywibowo22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ardywibowo22a.html},\n abstract = \t {Continual Learning (CL) is the problem of sequentially learning a set of tasks and preserving all the knowledge acquired. Many existing methods assume that the data stream is explicitly divided into a sequence of known contexts (tasks), and use this information to know when to transfer knowledge from one context to another. Unfortunately, many real-world CL scenarios have no clear task nor context boundaries, motivating the study of task-agnostic CL, where neither the specific tasks nor their switches are known both in training and testing. This paper proposes a variational architecture growing framework dubbed VariGrow. By interpreting dynamically growing neural networks as a Bayesian approximation, and defining flexible implicit variational distributions, VariGrow detects if a new task is arriving through an energy-based novelty score. If the novelty score is high and the sample is \u201cdetected\" as a new task, VariGrow will grow a new expert module to be responsible for it. Otherwise, the sample will be assigned to one of the existing experts who is most \u201cfamiliar\" with it (i.e., one with the lowest novelty score). We have tested VariGrow on several CIFAR and ImageNet-based benchmarks for the strict task-agnostic CL setting and demonstrate its consistent superior performance. 
Perhaps surprisingly, its performance can even be competitive compared to task-aware methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/ardywibowo22a/ardywibowo22a.pdf", "supp": "", "pdf_size": 579136, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9078825771714249634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Texas A&M University; Texas A&M University; University of Texas; Texas A&M University; University of Washington; Texas A&M University", "aff_domain": "tamu.edu; ; ; ; ; ", "email": "tamu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ardywibowo22a.html", "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Texas A&M University;University of Texas;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tamu.edu;https://www.utexas.edu;https://www.washington.edu", "aff_unique_abbr": "TAMU;UT;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Variational Feature Pyramid Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17887", "id": "17887", "proceeding": "https://proceedings.mlr.press/v162/dimitrakopoulos22a.html", "poster": "/media/PosterPDFs/ICML%202022/f12f2b34a0c3174269c19e21c07dee68.png?t=1657273548.417688", "slides": "", "author_site": "PANAGIOTIS DIMITRAKOPOULOS, Giorgos Sfikas, CHRISTOPHOROS NIKOU", "author": "Panagiotis Dimitrakopoulos; Giorgos Sfikas; Christophoros Nikou", "abstract": "Recent architectures for object detection adopt a Feature Pyramid Network as a backbone for deep feature extraction. Many works focus on the design of pyramid networks which produce richer feature representations. In this work, we opt to learn a dataset-specific architecture for Feature Pyramid Networks. With the proposed method, the network fuses features at multiple scales, it is efficient in terms of parameters and operations, and yields better results across a variety of tasks and datasets. Starting by a complex network, we adopt Variational Inference to prune redundant connections. Our model, integrated with standard detectors, outperforms the state-of-the-art feature fusion networks.", "bibtex": "@InProceedings{pmlr-v162-dimitrakopoulos22a,\n title = \t {Variational Feature Pyramid Networks},\n author = {Dimitrakopoulos, Panagiotis and Sfikas, Giorgos and Nikou, Christophoros},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5142--5152},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/dimitrakopoulos22a/dimitrakopoulos22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/dimitrakopoulos22a.html},\n abstract = \t {Recent architectures for object detection adopt a Feature Pyramid Network as a backbone for deep feature extraction. Many works focus on the design of pyramid networks which produce richer feature representations. In this work, we opt to learn a dataset-specific architecture for Feature Pyramid Networks. 
With the proposed method, the network fuses features at multiple scales, it is efficient in terms of parameters and operations, and yields better results across a variety of tasks and datasets. Starting by a complex network, we adopt Variational Inference to prune redundant connections. Our model, integrated with standard detectors, outperforms the state-of-the-art feature fusion networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/dimitrakopoulos22a/dimitrakopoulos22a.pdf", "supp": "", "pdf_size": 3776872, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14476369123412232474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science and Engineering, University of Ioannina, Ioannina, Greece+University of West Attica, Athens, Greece+National Center for Scientific Research \u201cDemokritos\u201d, Athens, Greece; Department of Computer Science and Engineering, University of Ioannina, Ioannina, Greece+University of West Attica, Athens, Greece+National Center for Scientific Research \u201cDemokritos\u201d, Athens, Greece; Department of Computer Science and Engineering, University of Ioannina, Ioannina, Greece", "aff_domain": "uoi.gr;cse.uoi.gr; ", "email": "uoi.gr;cse.uoi.gr; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/dimitrakopoulos22a.html", "aff_unique_index": "0+1+2;0+1+2;0", "aff_unique_norm": "University of Ioannina;University of West Attica;National Center for Scientific Research 'Demokritos'", "aff_unique_dep": "Department of Computer Science and Engineering;;", "aff_unique_url": "https://www.uoi.gr;;https://www.demokritos.gr", "aff_unique_abbr": ";;NCSR 'Demokritos'", "aff_campus_unique_index": "0+1+1;0+1+1;0", "aff_campus_unique": "Ioannina;Athens", "aff_country_unique_index": "0+0+0;0+0+0;0", "aff_country_unique": "Greece" }, { "title": "Variational Inference for Infinitely Deep Neural Networks", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17237", "id": "17237", "proceeding": "https://proceedings.mlr.press/v162/nazaret22a.html", "poster": "/media/PosterPDFs/ICML%202022/5fc34ed307aac159a30d81181c99847e.png?t=1658291922.6806993", "slides": "", "author_site": "Achille Nazaret, David Blei", "author": "Achille Nazaret; David Blei", "abstract": "We introduce the unbounded depth neural network (UDN), an infinitely deep probabilistic model that adapts its complexity to the training data. The UDN contains an infinite sequence of hidden layers and places an unbounded prior on a truncation L, the layer from which it produces its data. Given a dataset of observations, the posterior UDN provides a conditional distribution of both the parameters of the infinite neural network and its truncation. We develop a novel variational inference algorithm to approximate this posterior, optimizing a distribution of the neural network weights and of the truncation depth L, and without any upper limit on L. To this end, the variational family has a special structure: it models neural network weights of arbitrary depth, and it dynamically creates or removes free variational parameters as its distribution of the truncation is optimized. (Unlike heuristic approaches to model search, it is solely through gradient-based optimization that this algorithm explores the space of truncations.) We study the UDN on real and synthetic data. 
We find that the UDN adapts its posterior depth to the dataset complexity; it outperforms standard neural networks of similar computational complexity; and it outperforms other approaches to infinite-depth neural networks.", "bibtex": "@InProceedings{pmlr-v162-nazaret22a,\n title = \t {Variational Inference for Infinitely Deep Neural Networks},\n author = {Nazaret, Achille and Blei, David},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {16447--16461},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/nazaret22a/nazaret22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/nazaret22a.html},\n abstract = \t {We introduce the unbounded depth neural network (UDN), an infinitely deep probabilistic model that adapts its complexity to the training data. The UDN contains an infinite sequence of hidden layers and places an unbounded prior on a truncation L, the layer from which it produces its data. Given a dataset of observations, the posterior UDN provides a conditional distribution of both the parameters of the infinite neural network and its truncation. We develop a novel variational inference algorithm to approximate this posterior, optimizing a distribution of the neural network weights and of the truncation depth L, and without any upper limit on L. To this end, the variational family has a special structure: it models neural network weights of arbitrary depth, and it dynamically creates or removes free variational parameters as its distribution of the truncation is optimized. (Unlike heuristic approaches to model search, it is solely through gradient-based optimization that this algorithm explores the space of truncations.) We study the UDN on real and synthetic data. 
We find that the UDN adapts its posterior depth to the dataset complexity; it outperforms standard neural networks of similar computational complexity; and it outperforms other approaches to infinite-depth neural networks.}\n}", "pdf": "https://proceedings.mlr.press/v162/nazaret22a/nazaret22a.pdf", "supp": "", "pdf_size": 6266652, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15923008707496019552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Columbia University, New York, USA+Department of Statistics, Columbia University, New York, USA; Department of Computer Science, Columbia University, New York, USA+Department of Statistics, Columbia University, New York, USA", "aff_domain": "columbia.edu; ", "email": "columbia.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/nazaret22a.html", "aff_unique_index": "0+0;0+0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "New York", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United States" }, { "title": "Variational Inference with Locally Enhanced Bounds for Hierarchical Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17093", "id": "17093", "proceeding": "https://proceedings.mlr.press/v162/geffner22a.html", "poster": "/media/PosterPDFs/ICML%202022/33853141e0873909be88f5c3e6144cc6.png?t=1657576381.9484344", "slides": "", "author_site": "Tomas Geffner, Justin Domke", "author": "Tomas Geffner; Justin Domke", "abstract": "Hierarchical models represent a challenging setting for inference algorithms. MCMC methods struggle to scale to large models with many local variables and observations, and variational inference (VI) may fail to provide accurate approximations due to the use of simple variational families. Some variational methods (e.g. importance weighted VI) integrate Monte Carlo methods to give better accuracy, but these tend to be unsuitable for hierarchical models, as they do not allow for subsampling and their performance tends to degrade for high dimensional models. We propose a new family of variational bounds for hierarchical models, based on the application of tightening methods (e.g. importance weighting) separately for each group of local random variables. 
We show that our approach naturally allows the use of subsampling to get unbiased gradients, and that it fully leverages the power of methods that build tighter lower bounds by applying them independently in lower dimensional spaces, leading to better results and more accurate posterior approximations than relevant baselines.", "bibtex": "@InProceedings{pmlr-v162-geffner22a,\n title = \t {Variational Inference with Locally Enhanced Bounds for Hierarchical Models},\n author = {Geffner, Tomas and Domke, Justin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7310--7323},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/geffner22a/geffner22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/geffner22a.html},\n abstract = \t {Hierarchical models represent a challenging setting for inference algorithms. MCMC methods struggle to scale to large models with many local variables and observations, and variational inference (VI) may fail to provide accurate approximations due to the use of simple variational families. Some variational methods (e.g. importance weighted VI) integrate Monte Carlo methods to give better accuracy, but these tend to be unsuitable for hierarchical models, as they do not allow for subsampling and their performance tends to degrade for high dimensional models. We propose a new family of variational bounds for hierarchical models, based on the application of tightening methods (e.g. importance weighting) separately for each group of local random variables. We show that our approach naturally allows the use of subsampling to get unbiased gradients, and that it fully leverages the power of methods that build tighter lower bounds by applying them independently in lower dimensional spaces, leading to better results and more accurate posterior approximations than relevant baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/geffner22a/geffner22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/geffner22a-supp.zip", "pdf_size": 8447171, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=504014592023357238&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/geffner22a.html" }, { "title": "Variational Mixtures of ODEs for Inferring Cellular Gene Expression Dynamics", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16629", "id": "16629", "proceeding": "https://proceedings.mlr.press/v162/gu22a.html", "poster": "/media/PosterPDFs/ICML%202022/c7b03782920d35145eb4c97556d194a3.png?t=1657423901.6239967", "slides": "", "author_site": "Yichen Gu, DAVID BLAAUW, Joshua Welch", "author": "Yichen Gu; David T Blaauw; Joshua Welch", "abstract": "A key problem in computational biology is discovering the gene expression changes that regulate cell fate transitions, in which one cell type turns into another. However, each individual cell cannot be tracked longitudinally, and cells at the same point in real time may be at different stages of the transition process. 
This can be viewed as a problem of learning the behavior of a dynamical system from observations whose times are unknown. Additionally, a single progenitor cell type often bifurcates into multiple child cell types, further complicating the problem of modeling the dynamics. To address this problem, we developed an approach called variational mixtures of ordinary differential equations. By using a simple family of ODEs informed by the biochemistry of gene expression to constrain the likelihood of a deep generative model, we can simultaneously infer the latent time and latent state of each cell and predict its future gene expression state. The model can be interpreted as a mixture of ODEs whose parameters vary continuously across a latent space of cell states. Our approach dramatically improves data fit, latent time inference, and future cell state estimation of single-cell gene expression data compared to previous approaches.", "bibtex": "@InProceedings{pmlr-v162-gu22a,\n title = \t {Variational Mixtures of {ODE}s for Inferring Cellular Gene Expression Dynamics},\n author = {Gu, Yichen and Blaauw, David T and Welch, Joshua},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {7887--7901},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/gu22a/gu22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/gu22a.html},\n abstract = \t {A key problem in computational biology is discovering the gene expression changes that regulate cell fate transitions, in which one cell type turns into another. However, each individual cell cannot be tracked longitudinally, and cells at the same point in real time may be at different stages of the transition process. This can be viewed as a problem of learning the behavior of a dynamical system from observations whose times are unknown. Additionally, a single progenitor cell type often bifurcates into multiple child cell types, further complicating the problem of modeling the dynamics. To address this problem, we developed an approach called variational mixtures of ordinary differential equations. By using a simple family of ODEs informed by the biochemistry of gene expression to constrain the likelihood of a deep generative model, we can simultaneously infer the latent time and latent state of each cell and predict its future gene expression state. The model can be interpreted as a mixture of ODEs whose parameters vary continuously across a latent space of cell states. 
Our approach dramatically improves data fit, latent time inference, and future cell state estimation of single-cell gene expression data compared to previous approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/gu22a/gu22a.pdf", "supp": "", "pdf_size": 6655413, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5570506012304975998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Electrical Engineering and Computer Science, University of Michigan, Ann Arbor, United States+Department of Computational Medicine and Bioinformatics, University of Michigan, Ann Arbor, United States; Department of Electrical Engineering and Computer Science, University of Michigan, Ann Arbor, United States+Department of Computational Medicine and Bioinformatics, University of Michigan, Ann Arbor, United States; Department of Electrical Engineering and Computer Science, University of Michigan, Ann Arbor, United States+Department of Computational Medicine and Bioinformatics, University of Michigan, Ann Arbor, United States", "aff_domain": "umich.edu;umich.edu; ", "email": "umich.edu;umich.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/gu22a.html", "aff_unique_index": "0+0;0+0;0+0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Variational On-the-Fly Personalization", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16109", "id": "16109", "proceeding": "https://proceedings.mlr.press/v162/kim22e.html", "poster": "/media/PosterPDFs/ICML%202022/cd61a580392a70389e27b0bc2b439f49.png?t=1658020645.147546", "slides": "/media/icml-2022/Slides/16109.pdf", "author_site": "Jangho Kim, Jun-Tae Lee, Simyung Chang, NOJUN KWAK", "author": "Jangho Kim; Jun-Tae Lee; Simyung Chang; Nojun Kwak", "abstract": "With the development of deep learning (DL) technologies, the demand for DL-based services on personal devices, such as mobile phones, also increases rapidly. In this paper, we propose a novel personalization method, Variational On-the-Fly Personalization. Compared to the conventional personalization methods that require additional fine-tuning with personal data, the proposed method only requires forwarding a handful of personal data on-the-fly. Assuming even a single personal data can convey the characteristics of a target person, we develop the variational hyper-personalizer to capture the weight distribution of layers that fits the target person. In the testing phase, the hyper-personalizer estimates the model\u2019s weights on-the-fly based on personality by forwarding only a small amount of (even a single) personal enrollment data. Hence, the proposed method can perform the personalization without any training software platform and additional cost in the edge device. 
In experiments, we show our approach can effectively generate reliable personalized models via forwarding (not back-propagating) a handful of samples.", "bibtex": "@InProceedings{pmlr-v162-kim22e,\n title = \t {Variational On-the-Fly Personalization},\n author = {Kim, Jangho and Lee, Jun-Tae and Chang, Simyung and Kwak, Nojun},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11134--11147},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22e/kim22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22e.html},\n abstract = \t {With the development of deep learning (DL) technologies, the demand for DL-based services on personal devices, such as mobile phones, also increases rapidly. In this paper, we propose a novel personalization method, Variational On-the-Fly Personalization. Compared to the conventional personalization methods that require additional fine-tuning with personal data, the proposed method only requires forwarding a handful of personal data on-the-fly. Assuming even a single personal data can convey the characteristics of a target person, we develop the variational hyper-personalizer to capture the weight distribution of layers that fits the target person. In the testing phase, the hyper-personalizer estimates the model\u2019s weights on-the-fly based on personality by forwarding only a small amount of (even a single) personal enrollment data. Hence, the proposed method can perform the personalization without any training software platform and additional cost in the edge device. In experiments, we show our approach can effectively generate reliable personalized models via forwarding (not back-propagating) a handful of samples.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22e/kim22e.pdf", "supp": "", "pdf_size": 1166289, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2963772835076607567&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Qualcomm AI Research, an initiative of Qualcomm Technologies, Inc. 
+ Seoul National University; Qualcomm AI Research, an initiative of Qualcomm Technologies, Inc.; Qualcomm AI Research, an initiative of Qualcomm Technologies, Inc.; Seoul National University", "aff_domain": "snu.ac.kr;qti.qualcomm.com;qti.qualcomm.com;snu.ac.kr", "email": "snu.ac.kr;qti.qualcomm.com;qti.qualcomm.com;snu.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/kim22e.html", "aff_unique_index": "0+1;0;0;1", "aff_unique_norm": "Qualcomm Technologies, Inc.;Seoul National University", "aff_unique_dep": "Qualcomm AI Research;", "aff_unique_url": "https://www.qualcomm.com/research;https://www.snu.ac.kr", "aff_unique_abbr": "Qualcomm;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;0;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "Variational Sparse Coding with Learned Thresholding", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16429", "id": "16429", "proceeding": "https://proceedings.mlr.press/v162/fallah22a.html", "poster": "/media/PosterPDFs/ICML%202022/e9510081ac30ffa83f10b68cde1cac07_MkukYer.png?t=1657664003.8744752", "slides": "", "author_site": "Kion Fallah, Christopher Rozell", "author": "Kion Fallah; Christopher J Rozell", "abstract": "Sparse coding strategies have been lauded for their parsimonious representations of data that leverage low dimensional structure. However, inference of these codes typically relies on an optimization procedure with poor computational scaling in high-dimensional problems. For example, sparse inference in the representations learned in the high-dimensional intermediary layers of deep neural networks (DNNs) requires an iterative minimization to be performed at each training step. As such, recent, quick methods in variational inference have been proposed to infer sparse codes by learning a distribution over the codes with a DNN. In this work, we propose a new approach to variational sparse coding that allows us to learn sparse distributions by thresholding samples, avoiding the use of problematic relaxations. We first evaluate and analyze our method by training a linear generator, showing that it has superior performance, statistical efficiency, and gradient estimation compared to other sparse distributions. We then compare to a standard variational autoencoder using a DNN generator on the CelebA dataset.", "bibtex": "@InProceedings{pmlr-v162-fallah22a,\n title = \t {Variational Sparse Coding with Learned Thresholding},\n author = {Fallah, Kion and Rozell, Christopher J},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6034--6058},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fallah22a/fallah22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fallah22a.html},\n abstract = \t {Sparse coding strategies have been lauded for their parsimonious representations of data that leverage low dimensional structure. However, inference of these codes typically relies on an optimization procedure with poor computational scaling in high-dimensional problems. 
For example, sparse inference in the representations learned in the high-dimensional intermediary layers of deep neural networks (DNNs) requires an iterative minimization to be performed at each training step. As such, recent, quick methods in variational inference have been proposed to infer sparse codes by learning a distribution over the codes with a DNN. In this work, we propose a new approach to variational sparse coding that allows us to learn sparse distributions by thresholding samples, avoiding the use of problematic relaxations. We first evaluate and analyze our method by training a linear generator, showing that it has superior performance, statistical efficiency, and gradient estimation compared to other sparse distributions. We then compare to a standard variational autoencoder using a DNN generator on the CelebA dataset.}\n}", "pdf": "https://proceedings.mlr.press/v162/fallah22a/fallah22a.pdf", "supp": "", "pdf_size": 7849548, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10401057138019982209&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "ML@GT, Georgia Institute of Technology, Atlanta, Georgia; ML@GT, Georgia Institute of Technology, Atlanta, Georgia", "aff_domain": "gatech.edu; ", "email": "gatech.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/fallah22a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "ML@GT", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Variational Wasserstein gradient flow", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16823", "id": "16823", "proceeding": "https://proceedings.mlr.press/v162/fan22d.html", "poster": "/media/PosterPDFs/ICML%202022/20d135f0f28185b84a4cf7aa51f29500_3NdB0tY.png?t=1658687099.8011887", "slides": "/media/icml-2022/Slides/16823.pdf", "author_site": "Jiaojiao Fan, Qinsheng Zhang, Amirhossein Taghvaei, Yongxin Chen", "author": "Jiaojiao Fan; Qinsheng Zhang; Amirhossein Taghvaei; Yongxin Chen", "abstract": "Wasserstein gradient flow has emerged as a promising approach to solve optimization problems over the space of probability distributions. A recent trend is to use the well-known JKO scheme in combination with input convex neural networks to numerically implement the proximal step. The most challenging step, in this setup, is to evaluate functions involving density explicitly, such as entropy, in terms of samples. This paper builds on the recent works with a slight but crucial difference: we propose to utilize a variational formulation of the objective function formulated as maximization over a parametric class of functions. Theoretically, the proposed variational formulation allows the construction of gradient flows directly for empirical distributions with a well-defined and meaningful objective function. Computationally, this approach replaces the computationally expensive step in existing methods, to handle objective functions involving density, with inner loop updates that only require a small batch of samples and scale well with the dimension. 
The performance and scalability of the proposed method are illustrated with the aid of several numerical experiments involving high-dimensional synthetic and real datasets.", "bibtex": "@InProceedings{pmlr-v162-fan22d,\n title = \t {Variational {W}asserstein gradient flow},\n author = {Fan, Jiaojiao and Zhang, Qinsheng and Taghvaei, Amirhossein and Chen, Yongxin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6185--6215},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fan22d/fan22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/fan22d.html},\n abstract = \t {Wasserstein gradient flow has emerged as a promising approach to solve optimization problems over the space of probability distributions. A recent trend is to use the well-known JKO scheme in combination with input convex neural networks to numerically implement the proximal step. The most challenging step, in this setup, is to evaluate functions involving density explicitly, such as entropy, in terms of samples. This paper builds on the recent works with a slight but crucial difference: we propose to utilize a variational formulation of the objective function formulated as maximization over a parametric class of functions. Theoretically, the proposed variational formulation allows the construction of gradient flows directly for empirical distributions with a well-defined and meaningful objective function. Computationally, this approach replaces the computationally expensive step in existing methods, to handle objective functions involving density, with inner loop updates that only require a small batch of samples and scale well with the dimension. 
The performance and scalability of the proposed method are illustrated with the aid of several numerical experiments involving high-dimensional synthetic and real datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/fan22d/fan22d.pdf", "supp": "", "pdf_size": 12307945, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4247639090058922494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Georgia Institute of Technology; Georgia Institute of Technology; University of Washington, Seattle; Georgia Institute of Technology", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/fan22d.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Georgia Institute of Technology;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.washington.edu", "aff_unique_abbr": "Georgia Tech;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Variational nearest neighbor Gaussian process", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17619", "id": "17619", "proceeding": "https://proceedings.mlr.press/v162/wu22h.html", "poster": "/media/PosterPDFs/ICML%202022/c8dfece5cc68249206e4690fc4737a8d_4D775m8.png?t=1657851270.365615", "slides": "", "author_site": "Luhuan Wu, Geoff Pleiss, John Cunningham", "author": "Luhuan Wu; Geoff Pleiss; John P Cunningham", "abstract": "Variational approximations to Gaussian processes (GPs) typically use a small set of inducing points to form a low-rank approximation to the covariance matrix. In this work, we instead exploit a sparse approximation of the precision matrix. We propose variational nearest neighbor Gaussian process (VNNGP), which introduces a prior that only retains correlations within $K$ nearest-neighboring observations, thereby inducing sparse precision structure. Using the variational framework, VNNGP\u2019s objective can be factorized over both observations and inducing points, enabling stochastic optimization with a time complexity of $O(K^3)$. Hence, we can arbitrarily scale the inducing point size, even to the point of putting inducing points at every observed location. We compare VNNGP to other scalable GPs through various experiments, and demonstrate that VNNGP (1) can dramatically outperform low-rank methods, and (2) is less prone to overfitting than other nearest neighbor methods.", "bibtex": "@InProceedings{pmlr-v162-wu22h,\n title = \t {Variational nearest neighbor {G}aussian process},\n author = {Wu, Luhuan and Pleiss, Geoff and Cunningham, John P},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {24114--24130},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wu22h/wu22h.pdf},\n url = \t {https://proceedings.mlr.press/v162/wu22h.html},\n abstract = \t {Variational approximations to Gaussian processes (GPs) typically use a small set of inducing points to form a low-rank approximation to the covariance matrix. In this work, we instead exploit a sparse approximation of the precision matrix. 
We propose variational nearest neighbor Gaussian process (VNNGP), which introduces a prior that only retains correlations within $K$ nearest-neighboring observations, thereby inducing sparse precision structure. Using the variational framework, VNNGP\u2019s objective can be factorized over both observations and inducing points, enabling stochastic optimization with a time complexity of $O(K^3)$. Hence, we can arbitrarily scale the inducing point size, even to the point of putting inducing points at every observed location. We compare VNNGP to other scalable GPs through various experiments, and demonstrate that VNNGP (1) can dramatically outperform low-rank methods, and (2) is less prone to overfitting than other nearest neighbor methods.}\n}", "pdf": "https://proceedings.mlr.press/v162/wu22h/wu22h.pdf", "supp": "", "pdf_size": 577742, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5142914812392801011&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Statistics, Columbia University + Zuckerman Institute, Columbia University; Zuckerman Institute, Columbia University; Department of Statistics, Columbia University + Zuckerman Institute, Columbia University", "aff_domain": "columbia.edu; ; ", "email": "columbia.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wu22h.html", "aff_unique_index": "0+0;0;0+0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United States" }, { "title": "Versatile Dueling Bandits: Best-of-both World Analyses for Learning from Relative Preferences", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18325", "id": "18325", "proceeding": "https://proceedings.mlr.press/v162/saha22a.html", "poster": "/media/PosterPDFs/ICML%202022/4c4e5249f45d8cf6a0387c58ac13f514.png?t=1657562854.9310868", "slides": "", "author_site": "Aadirupa Saha, Pierre Gaillard", "author": "Aadirupa Saha; Pierre Gaillard", "abstract": "We study the problem of $K$-armed dueling bandit for both stochastic and adversarial environments, where the goal of the learner is to aggregate information through relative preferences of pair of decision points queried in an online sequential manner. We first propose a novel reduction from any (general) dueling bandits to multi-armed bandits which allows us to improve many existing results in dueling bandits. 
In particular,", "bibtex": "@InProceedings{pmlr-v162-saha22a,\n title = \t {Versatile Dueling Bandits: Best-of-both World Analyses for Learning from Relative Preferences},\n author = {Saha, Aadirupa and Gaillard, Pierre},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {19011--19026},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/saha22a/saha22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/saha22a.html},\n abstract = \t {We study the problem of $K$-armed dueling bandit for both stochastic and adversarial environments, where the goal of the learner is to aggregate information through relative preferences of pair of decision points queried in an online sequential manner. We first propose a novel reduction from any (general) dueling bandits to multi-armed bandits which allows us to improve many existing results in dueling bandits. In particular,", "pdf": "https://proceedings.mlr.press/v162/saha22a/saha22a.pdf", "supp": "", "pdf_size": 2618020, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18333855710411646141&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Microsoft Research, NYC, US; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/saha22a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;Universite Grenoble Alpes", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "MSR;UGA", "aff_campus_unique_index": "0;1", "aff_campus_unique": "New York City;Grenoble", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;France" }, { "title": "Versatile Offline Imitation from Observations and Examples via Regularized State-Occupancy Matching", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16597", "id": "16597", "proceeding": "https://proceedings.mlr.press/v162/ma22a.html", "poster": "/media/PosterPDFs/ICML%202022/9f1f64b519d20e2ccc36e1589a8f7555.png?t=1657376786.2064795", "slides": "", "author_site": "Jason Yecheng Ma, Andrew Shen, Dinesh Jayaraman, Osbert Bastani", "author": "Yecheng Ma; Andrew Shen; Dinesh Jayaraman; Osbert Bastani", "abstract": "We propose State Matching Offline DIstribution Correction Estimation (SMODICE), a novel and versatile regression-based offline imitation learning algorithm derived via state-occupancy matching. We show that the SMODICE objective admits a simple optimization procedure through an application of Fenchel duality and an analytic solution in tabular MDPs. Without requiring access to expert actions, SMODICE can be effectively applied to three offline IL settings: (i) imitation from observations (IfO), (ii) IfO with dynamics or morphologically mismatched expert, and (iii) example-based reinforcement learning, which we show can be formulated as a state-occupancy matching problem. We extensively evaluate SMODICE on both gridworld environments as well as on high-dimensional offline benchmarks. 
Our results demonstrate that SMODICE is effective for all three problem settings and significantly outperforms prior state-of-art.", "bibtex": "@InProceedings{pmlr-v162-ma22a,\n title = \t {Versatile Offline Imitation from Observations and Examples via Regularized State-Occupancy Matching},\n author = {Ma, Yecheng and Shen, Andrew and Jayaraman, Dinesh and Bastani, Osbert},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14639--14663},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ma22a/ma22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ma22a.html},\n abstract = \t {We propose State Matching Offline DIstribution Correction Estimation (SMODICE), a novel and versatile regression-based offline imitation learning algorithm derived via state-occupancy matching. We show that the SMODICE objective admits a simple optimization procedure through an application of Fenchel duality and an analytic solution in tabular MDPs. Without requiring access to expert actions, SMODICE can be effectively applied to three offline IL settings: (i) imitation from observations (IfO), (ii) IfO with dynamics or morphologically mismatched expert, and (iii) example-based reinforcement learning, which we show can be formulated as a state-occupancy matching problem. We extensively evaluate SMODICE on both gridworld environments as well as on high-dimensional offline benchmarks. Our results demonstrate that SMODICE is effective for all three problem settings and significantly outperforms prior state-of-art.}\n}", "pdf": "https://proceedings.mlr.press/v162/ma22a/ma22a.pdf", "supp": "", "pdf_size": 2342222, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11179690746522153663&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer and Information Science, University of Pennsylvania, Philadelphia, USA+1; University of Melbourne, Melbourne, Australia+2; Department of Computer and Information Science, University of Pennsylvania, Philadelphia, USA; Department of Computer and Information Science, University of Pennsylvania, Philadelphia, USA", "aff_domain": "seas.upenn.edu; ; ; ", "email": "seas.upenn.edu; ; ; ", "github": "", "project": "https://sites.google.com/view/smodice/home", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/ma22a.html", "aff_unique_index": "0;2;0;0", "aff_unique_norm": "University of Pennsylvania;;University of Melbourne", "aff_unique_dep": "Department of Computer and Information Science;;", "aff_unique_url": "https://www.upenn.edu;;https://www.unimelb.edu.au", "aff_unique_abbr": "UPenn;;UniMelb", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Philadelphia;;Melbourne", "aff_country_unique_index": "0;2;0;0", "aff_country_unique": "United States;;Australia" }, { "title": "ViT-NeT: Interpretable Vision Transformers with Neural Tree Decoder", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16525", "id": "16525", "proceeding": "https://proceedings.mlr.press/v162/kim22g.html", "poster": "/media/PosterPDFs/ICML%202022/3e15cc11f979ed25912dff5b0669f2cd.png?t=1657526260.3052948", "slides": "", "author_site": "Sangwon Kim, Jaeyeal Nam, Byoung Chul Ko", "author": 
"Sangwon Kim; Jaeyeal Nam; Byoung Chul Ko", "abstract": "Vision transformers (ViTs), which have demonstrated a state-of-the-art performance in image classification, can also visualize global interpretations through attention-based contributions. However, the complexity of the model makes it difficult to interpret the decision-making process, and the ambiguity of the attention maps can cause incorrect correlations between image patches. In this study, we propose a new ViT neural tree decoder (ViT-NeT). A ViT acts as a backbone, and to solve its limitations, the output contextual image patches are applied to the proposed NeT. The NeT aims to accurately classify fine-grained objects with similar inter-class correlations and different intra-class correlations. In addition, it describes the decision-making process through a tree structure and prototype and enables a visual interpretation of the results. The proposed ViT-NeT is designed to not only improve the classification performance but also provide a human-friendly interpretation, which is effective in resolving the trade-off between performance and interpretability. We compared the performance of ViT-NeT with other state-of-art methods using widely used fine-grained visual categorization benchmark datasets and experimentally proved that the proposed method is superior in terms of the classification performance and interpretability. The code and models are publicly available at https://github.com/jumpsnack/ViT-NeT.", "bibtex": "@InProceedings{pmlr-v162-kim22g,\n title = \t {{V}i{T}-{N}e{T}: Interpretable Vision Transformers with Neural Tree Decoder},\n author = {Kim, Sangwon and Nam, Jaeyeal and Ko, Byoung Chul},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {11162--11172},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/kim22g/kim22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/kim22g.html},\n abstract = \t {Vision transformers (ViTs), which have demonstrated a state-of-the-art performance in image classification, can also visualize global interpretations through attention-based contributions. However, the complexity of the model makes it difficult to interpret the decision-making process, and the ambiguity of the attention maps can cause incorrect correlations between image patches. In this study, we propose a new ViT neural tree decoder (ViT-NeT). A ViT acts as a backbone, and to solve its limitations, the output contextual image patches are applied to the proposed NeT. The NeT aims to accurately classify fine-grained objects with similar inter-class correlations and different intra-class correlations. In addition, it describes the decision-making process through a tree structure and prototype and enables a visual interpretation of the results. The proposed ViT-NeT is designed to not only improve the classification performance but also provide a human-friendly interpretation, which is effective in resolving the trade-off between performance and interpretability. 
We compared the performance of ViT-NeT with other state-of-art methods using widely used fine-grained visual categorization benchmark datasets and experimentally proved that the proposed method is superior in terms of the classification performance and interpretability. The code and models are publicly available at https://github.com/jumpsnack/ViT-NeT.}\n}", "pdf": "https://proceedings.mlr.press/v162/kim22g/kim22g.pdf", "supp": "", "pdf_size": 6943064, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7284110818114269396&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Engineering, Keimyung University, Daegu, South Korea; Department of Computer Engineering, Keimyung University, Daegu, South Korea; Department of Computer Engineering, Keimyung University, Daegu, South Korea", "aff_domain": "kmu.ac.kr; ; ", "email": "kmu.ac.kr; ; ", "github": "https://github.com/jumpsnack/ViT-NeT", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/kim22g.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Keimyung University", "aff_unique_dep": "Department of Computer Engineering", "aff_unique_url": "http://www.keimyung.ac.kr", "aff_unique_abbr": "KMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Daegu", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Virtual Homogeneity Learning: Defending against Data Heterogeneity in Federated Learning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18205", "id": "18205", "proceeding": "https://proceedings.mlr.press/v162/tang22d.html", "poster": "/media/PosterPDFs/ICML%202022/e8f2779682fd11fa2067beffc27a9192.png?t=1656573636.821033", "slides": "", "author_site": "Zhenheng Tang, Yonggang Zhang, Shaohuai Shi, Xin He, Bo Han, Xiaowen Chu", "author": "Zhenheng Tang; Yonggang Zhang; Shaohuai Shi; Xin He; Bo Han; Xiaowen Chu", "abstract": "In federated learning (FL), model performance typically suffers from client drift induced by data heterogeneity, and mainstream works focus on correcting client drift. We propose a different approach named virtual homogeneity learning (VHL) to directly \u201crectify\u201d the data heterogeneity. In particular, VHL conducts FL with a virtual homogeneous dataset crafted to satisfy two conditions: containing", "bibtex": "@InProceedings{pmlr-v162-tang22d,\n title = \t {Virtual Homogeneity Learning: Defending against Data Heterogeneity in Federated Learning},\n author = {Tang, Zhenheng and Zhang, Yonggang and Shi, Shaohuai and He, Xin and Han, Bo and Chu, Xiaowen},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {21111--21132},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/tang22d/tang22d.pdf},\n url = \t {https://proceedings.mlr.press/v162/tang22d.html},\n abstract = \t {In federated learning (FL), model performance typically suffers from client drift induced by data heterogeneity, and mainstream works focus on correcting client drift. We propose a different approach named virtual homogeneity learning (VHL) to directly \u201crectify\u201d the data heterogeneity. 
In particular, VHL conducts FL with a virtual homogeneous dataset crafted to satisfy two conditions: containing", "pdf": "https://proceedings.mlr.press/v162/tang22d/tang22d.pdf", "supp": "", "pdf_size": 5632460, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5551753342557173221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, Hong Kong Baptist University; Department of Computer Science, Hong Kong Baptist University; Department of Computer Science and Engineering, The Hong Kong University of Science and Technology; Department of Computer Science, Hong Kong Baptist University; Department of Computer Science, Hong Kong Baptist University; Data Science and Analytics Thrust, The Hong Kong University of Science and Technology (Guangzhou)", "aff_domain": "ust.hk; ; ; ; ;ust.hk", "email": "ust.hk; ; ; ; ;ust.hk", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/tang22d.html", "aff_unique_index": "0;0;1;0;0;1", "aff_unique_norm": "Hong Kong Baptist University;Hong Kong University of Science and Technology", "aff_unique_dep": "Department of Computer Science;Department of Computer Science and Engineering", "aff_unique_url": "https://www.hkbu.edu.hk;https://www.ust.hk", "aff_unique_abbr": "HKBU;HKUST", "aff_campus_unique_index": "0;0;0;0;0;1", "aff_campus_unique": "Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Visual Attention Emerges from Recurrent Sparse Reconstruction", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17269", "id": "17269", "proceeding": "https://proceedings.mlr.press/v162/shi22e.html", "poster": "/media/PosterPDFs/ICML%202022/6395ebd0f4b478145ecfbaf939454fa4.png?t=1657762083.870578", "slides": "", "author_site": "Baifeng Shi, Yale Song, Neel Joshi, Trevor Darrell, Xin Wang", "author": "Baifeng Shi; Yale Song; Neel Joshi; Trevor Darrell; Xin Wang", "abstract": "Visual attention helps achieve robust perception under noise, corruption, and distribution shifts in human vision, which are areas where modern neural networks still fall short. We present VARS, Visual Attention from Recurrent Sparse reconstruction, a new attention formulation built on two prominent features of the human visual attention mechanism: recurrency and sparsity. Related features are grouped together via recurrent connections between neurons, with salient objects emerging via sparse regularization. VARS adopts an attractor network with recurrent connections that converges toward a stable pattern over time. Network layers are represented as ordinary differential equations (ODEs), formulating attention as a recurrent attractor network that equivalently optimizes the sparse reconstruction of input using a dictionary of \u201ctemplates\u201d encoding underlying patterns of data. We show that self-attention is a special case of VARS with a single-step optimization and no sparsity constraint. 
VARS can be readily used as a replacement for self-attention in popular vision transformers, consistently improving their robustness across various benchmarks.", "bibtex": "@InProceedings{pmlr-v162-shi22e,\n title = \t {Visual Attention Emerges from Recurrent Sparse Reconstruction},\n author = {Shi, Baifeng and Song, Yale and Joshi, Neel and Darrell, Trevor and Wang, Xin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {20041--20056},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/shi22e/shi22e.pdf},\n url = \t {https://proceedings.mlr.press/v162/shi22e.html},\n abstract = \t {Visual attention helps achieve robust perception under noise, corruption, and distribution shifts in human vision, which are areas where modern neural networks still fall short. We present VARS, Visual Attention from Recurrent Sparse reconstruction, a new attention formulation built on two prominent features of the human visual attention mechanism: recurrency and sparsity. Related features are grouped together via recurrent connections between neurons, with salient objects emerging via sparse regularization. VARS adopts an attractor network with recurrent connections that converges toward a stable pattern over time. Network layers are represented as ordinary differential equations (ODEs), formulating attention as a recurrent attractor network that equivalently optimizes the sparse reconstruction of input using a dictionary of \u201ctemplates\u201d encoding underlying patterns of data. We show that self-attention is a special case of VARS with a single-step optimization and no sparsity constraint. 
VARS can be readily used as a replacement for self-attention in popular vision transformers, consistently improving their robustness across various benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/shi22e/shi22e.pdf", "supp": "", "pdf_size": 11121909, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=626547526031635836&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "University of California, Berkeley; Microsoft Research; Microsoft Research; University of California, Berkeley; Microsoft Research", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/shi22e.html", "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "University of California, Berkeley;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.berkeley.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UC Berkeley;MSR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Volatility Based Kernels and Moving Average Means for Accurate Forecasting with Gaussian Processes", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16873", "id": "16873", "proceeding": "https://proceedings.mlr.press/v162/benton22a.html", "poster": "/media/PosterPDFs/ICML%202022/2107931de60c5a7c5d526bd1d6a8a34d.png?t=1657905801.212777", "slides": "", "author_site": "Gregory Benton, Wesley Maddox, Andrew Wilson", "author": "Gregory Benton; Wesley Maddox; Andrew Gordon Wilson", "abstract": "A broad class of stochastic volatility models are defined by systems of stochastic differential equations, and while these models have seen widespread success in domains such as finance and statistical climatology, they typically lack an ability to condition on historical data to produce a true posterior distribution. To address this fundamental limitation, we show how to re-cast a class of stochastic volatility models as a hierarchical Gaussian process (GP) model with specialized covariance functions. This GP model retains the inductive biases of the stochastic volatility model while providing the posterior predictive distribution given by GP inference. 
Within this framework, we take inspiration from well studied domains to introduce a new class of models, Volt and Magpie, that significantly outperform baselines in stock and wind speed forecasting, and naturally extend to the multitask setting.", "bibtex": "@InProceedings{pmlr-v162-benton22a,\n title = \t {Volatility Based Kernels and Moving Average Means for Accurate Forecasting with {G}aussian Processes},\n author = {Benton, Gregory and Maddox, Wesley and Wilson, Andrew Gordon},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1798--1816},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/benton22a/benton22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/benton22a.html},\n abstract = \t {A broad class of stochastic volatility models are defined by systems of stochastic differential equations, and while these models have seen widespread success in domains such as finance and statistical climatology, they typically lack an ability to condition on historical data to produce a true posterior distribution. To address this fundamental limitation, we show how to re-cast a class of stochastic volatility models as a hierarchical Gaussian process (GP) model with specialized covariance functions. This GP model retains the inductive biases of the stochastic volatility model while providing the posterior predictive distribution given by GP inference. Within this framework, we take inspiration from well studied domains to introduce a new class of models, Volt and Magpie, that significantly outperform baselines in stock and wind speed forecasting, and naturally extend to the multitask setting.}\n}", "pdf": "https://proceedings.mlr.press/v162/benton22a/benton22a.pdf", "supp": "", "pdf_size": 853367, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=445432332886185125&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "New York University; New York University; New York University", "aff_domain": "nyu.edu; ; ", "email": "nyu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/benton22a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Weisfeiler-Lehman Meets Gromov-Wasserstein", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16497", "id": "16497", "proceeding": "https://proceedings.mlr.press/v162/chen22o.html", "poster": "", "slides": "", "author_site": "Samantha Chen, Sunhyuk Lim, Facundo Memoli, Zhengchao Wan, Yusu Wang", "author": "Samantha Chen; Sunhyuk Lim; Facundo Memoli; Zhengchao Wan; Yusu Wang", "abstract": "The Weisfeiler-Lehman (WL) test is a classical procedure for graph isomorphism testing. The WL test has also been widely used both for designing graph kernels and for analyzing graph neural networks. In this paper, we propose the Weisfeiler-Lehman (WL) distance, a notion of distance between labeled measure Markov chains (LMMCs), of which labeled graphs are special cases. 
The WL distance is polynomial time computable and is also compatible with the WL test in the sense that the former is positive if and only if the WL test can distinguish the two involved graphs. The WL distance captures and compares subtle structures of the underlying LMMCs and, as a consequence of this, it is more discriminating than the distance between graphs used for defining the state-of-the-art Wasserstein Weisfeiler-Lehman graph kernel. Inspired by the structure of the WL distance we identify a neural network architecture on LMMCs which turns out to be universal w.r.t. continuous functions defined on the space of all LMMCs (which includes all graphs) endowed with the WL distance. Finally, the WL distance turns out to be stable w.r.t. a natural variant of the Gromov-Wasserstein (GW) distance for comparing metric Markov chains that we identify. Hence, the WL distance can also be construed as a polynomial time lower bound for the GW distance which is in general NP-hard to compute.", "bibtex": "@InProceedings{pmlr-v162-chen22o,\n title = \t {Weisfeiler-Lehman Meets Gromov-{W}asserstein},\n author = {Chen, Samantha and Lim, Sunhyuk and Memoli, Facundo and Wan, Zhengchao and Wang, Yusu},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {3371--3416},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/chen22o/chen22o.pdf},\n url = \t {https://proceedings.mlr.press/v162/chen22o.html},\n abstract = \t {The Weisfeiler-Lehman (WL) test is a classical procedure for graph isomorphism testing. The WL test has also been widely used both for designing graph kernels and for analyzing graph neural networks. In this paper, we propose the Weisfeiler-Lehman (WL) distance, a notion of distance between labeled measure Markov chains (LMMCs), of which labeled graphs are special cases. The WL distance is polynomial time computable and is also compatible with the WL test in the sense that the former is positive if and only if the WL test can distinguish the two involved graphs. The WL distance captures and compares subtle structures of the underlying LMMCs and, as a consequence of this, it is more discriminating than the distance between graphs used for defining the state-of-the-art Wasserstein Weisfeiler-Lehman graph kernel. Inspired by the structure of the WL distance we identify a neural network architecture on LMMCs which turns out to be universal w.r.t. continuous functions defined on the space of all LMMCs (which includes all graphs) endowed with the WL distance. Finally, the WL distance turns out to be stable w.r.t. a natural variant of the Gromov-Wasserstein (GW) distance for comparing metric Markov chains that we identify. 
Hence, the WL distance can also be construed as a polynomial time lower bound for the GW distance which is in general NP-hard to compute.}\n}", "pdf": "https://proceedings.mlr.press/v162/chen22o/chen22o.pdf", "supp": "", "pdf_size": 664469, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3822575618322805369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/chen22o.html" }, { "title": "Welfare Maximization in Competitive Equilibrium: Reinforcement Learning for Markov Exchange Economy", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18355", "id": "18355", "proceeding": "https://proceedings.mlr.press/v162/liu22l.html", "poster": "", "slides": "/media/icml-2022/Slides/18355.pdf", "author_site": "ZHIHAN LIU, Lu Miao, Zhaoran Wang, Michael Jordan, Zhuoran Yang", "author": "Zhihan Liu; Miao Lu; Zhaoran Wang; Michael Jordan; Zhuoran Yang", "abstract": "We study a bilevel economic system, which we refer to as a", "bibtex": "@InProceedings{pmlr-v162-liu22l,\n title = \t {Welfare Maximization in Competitive Equilibrium: Reinforcement Learning for {M}arkov Exchange Economy},\n author = {Liu, Zhihan and Lu, Miao and Wang, Zhaoran and Jordan, Michael and Yang, Zhuoran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {13870--13911},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/liu22l/liu22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/liu22l.html},\n abstract = \t {We study a bilevel economic system, which we refer to as a", "pdf": "https://proceedings.mlr.press/v162/liu22l/liu22l.pdf", "supp": "", "pdf_size": 740092, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=277514496663444344&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Department of Industrial Engineering and Management Sciences, Northwestern University; School of the Gifted Young, University of Science and Technology of China; Department of Industrial Engineering and Management Sciences, Northwestern University; Department of Statistics, University of California, Berkeley; Department of Statistics and Data Science, Yale University", "aff_domain": "u.northwestern.edu;mail.ustc.edu.cn;gmail.com;cs.berkeley.edu;yale.edu", "email": "u.northwestern.edu;mail.ustc.edu.cn;gmail.com;cs.berkeley.edu;yale.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/liu22l.html", "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Northwestern University;University of Science and Technology of China;University of California, Berkeley;Yale University", "aff_unique_dep": "Department of Industrial Engineering and Management Sciences;School of the Gifted Young;Department of Statistics;Department of Statistics and Data Science", "aff_unique_url": "https://www.northwestern.edu;http://www.ustc.edu.cn;https://www.berkeley.edu;https://www.yale.edu", "aff_unique_abbr": "NU;USTC;UC Berkeley;Yale", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United 
States;China" }, { "title": "What Can Linear Interpolation of Neural Network Loss Landscapes Tell Us?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16391", "id": "16391", "proceeding": "https://proceedings.mlr.press/v162/vlaar22a.html", "poster": "/media/PosterPDFs/ICML%202022/7d6548bdc0082aacc950ed35e91fcccb.png?t=1657625084.7335305", "slides": "/media/icml-2022/Slides/16391.pdf", "author_site": "Tiffany Vlaar, Jonathan Frankle", "author": "Tiffany J Vlaar; Jonathan Frankle", "abstract": "Studying neural network loss landscapes provides insights into the nature of the underlying optimization problems. Unfortunately, loss landscapes are notoriously difficult to visualize in a human-comprehensible fashion. One common way to address this problem is to plot linear slices of the landscape, for example from the initial state of the network to the final state after optimization. On the basis of this analysis, prior work has drawn broader conclusions about the difficulty of the optimization problem. In this paper, we put inferences of this kind to the test, systematically evaluating how linear interpolation and final performance vary when altering the data, choice of initialization, and other optimizer and architecture design choices. Further, we use linear interpolation to study the role played by individual layers and substructures of the network. We find that certain layers are more sensitive to the choice of initialization, but that the shape of the linear path is not indicative of the changes in test accuracy of the model. Our results cast doubt on the broader intuition that the presence or absence of barriers when interpolating necessarily relates to the success of optimization.", "bibtex": "@InProceedings{pmlr-v162-vlaar22a,\n title = \t {What Can Linear Interpolation of Neural Network Loss Landscapes Tell Us?},\n author = {Vlaar, Tiffany J and Frankle, Jonathan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22325--22341},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/vlaar22a/vlaar22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/vlaar22a.html},\n abstract = \t {Studying neural network loss landscapes provides insights into the nature of the underlying optimization problems. Unfortunately, loss landscapes are notoriously difficult to visualize in a human-comprehensible fashion. One common way to address this problem is to plot linear slices of the landscape, for example from the initial state of the network to the final state after optimization. On the basis of this analysis, prior work has drawn broader conclusions about the difficulty of the optimization problem. In this paper, we put inferences of this kind to the test, systematically evaluating how linear interpolation and final performance vary when altering the data, choice of initialization, and other optimizer and architecture design choices. Further, we use linear interpolation to study the role played by individual layers and substructures of the network. We find that certain layers are more sensitive to the choice of initialization, but that the shape of the linear path is not indicative of the changes in test accuracy of the model. 
Our results cast doubt on the broader intuition that the presence or absence of barriers when interpolating necessarily relates to the success of optimization.}\n}", "pdf": "https://proceedings.mlr.press/v162/vlaar22a/vlaar22a.pdf", "supp": "", "pdf_size": 1709019, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17268083910123866748&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Mathematics, University of Edinburgh, Edinburgh, United Kingdom+MosaicML; MosaicML", "aff_domain": "ed.ac.uk;mosaicml.com", "email": "ed.ac.uk;mosaicml.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/vlaar22a.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "University of Edinburgh;MosaicML", "aff_unique_dep": "Department of Mathematics;", "aff_unique_url": "https://www.ed.ac.uk;https://www.mosaicml.com", "aff_unique_abbr": "Edinburgh;MosaicML", "aff_campus_unique_index": "0", "aff_campus_unique": "Edinburgh;", "aff_country_unique_index": "0+1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "What Dense Graph Do You Need for Self-Attention?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17503", "id": "17503", "proceeding": "https://proceedings.mlr.press/v162/wang22l.html", "poster": "/media/PosterPDFs/ICML%202022/e721a54a8cf18c8543d44782d9ef681f.png?t=1656658221.6100376", "slides": "", "author_site": "Yuxin Wang, Chu-Tak Lee, Qipeng Guo, Zhangyue Yin, yunhua zhou, Xuanjing Huang, Xipeng Qiu", "author": "Yuxin Wang; Chu-Tak Lee; Qipeng Guo; Zhangyue Yin; Yunhua Zhou; Xuanjing Huang; Xipeng Qiu", "abstract": "Transformers have made progress in miscellaneous tasks, but suffer from quadratic computational and memory complexities. Recent works propose sparse transformers with attention on sparse graphs to reduce complexity and remain strong performance. While effective, the crucial parts of how dense a graph needs to be to perform well are not fully explored. In this paper, we propose Normalized Information Payload (NIP), a graph scoring function measuring information transfer on graph, which provides an analysis tool for trade-offs between performance and complexity. Guided by this theoretical analysis, we present Hypercube Transformer, a sparse transformer that models token interactions in a hypercube and shows comparable or even better results with vanilla transformer while yielding $O(N\\log N)$ complexity with sequence length $N$. Experiments on tasks requiring various sequence lengths lay validation for our graph function well.", "bibtex": "@InProceedings{pmlr-v162-wang22l,\n title = \t {What Dense Graph Do You Need for Self-Attention?},\n author = {Wang, Yuxin and Lee, Chu-Tak and Guo, Qipeng and Yin, Zhangyue and Zhou, Yunhua and Huang, Xuanjing and Qiu, Xipeng},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22752--22768},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22l/wang22l.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22l.html},\n abstract = \t {Transformers have made progress in miscellaneous tasks, but suffer from quadratic computational and memory complexities. 
Recent works propose sparse transformers with attention on sparse graphs to reduce complexity and remain strong performance. While effective, the crucial parts of how dense a graph needs to be to perform well are not fully explored. In this paper, we propose Normalized Information Payload (NIP), a graph scoring function measuring information transfer on graph, which provides an analysis tool for trade-offs between performance and complexity. Guided by this theoretical analysis, we present Hypercube Transformer, a sparse transformer that models token interactions in a hypercube and shows comparable or even better results with vanilla transformer while yielding $O(N\\log N)$ complexity with sequence length $N$. Experiments on tasks requiring various sequence lengths lay validation for our graph function well.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22l/wang22l.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/wang22l-supp.zip", "pdf_size": 1210299, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6817431716045479667&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "School of Computer Science, Fudan University+Institute of Modern Languages and Linguistics, Fudan University; School of Computer Science, Fudan University; School of Computer Science, Fudan University; School of Computer Science, Fudan University; School of Computer Science, Fudan University; School of Computer Science, Fudan University+Institute of Modern Languages and Linguistics, Fudan University; School of Computer Science, Fudan University+Peng Cheng Laboratory", "aff_domain": "m.fudan.edu.cn; ; ; ; ; ;fudan.edu.cn", "email": "m.fudan.edu.cn; ; ; ; ; ;fudan.edu.cn", "github": "https://github.com/yxzwang/Normalized-Information-Payload", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/wang22l.html", "aff_unique_index": "0+0;0;0;0;0;0+0;0+1", "aff_unique_norm": "Fudan University;Pengcheng Laboratory", "aff_unique_dep": "School of Computer Science;Peng Cheng Laboratory", "aff_unique_url": "https://www.fudan.edu.cn;http://www.pcl.ac.cn", "aff_unique_abbr": "Fudan;PCL", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0+0;0+0", "aff_country_unique": "China" }, { "title": "What Language Model Architecture and Pretraining Objective Works Best for Zero-Shot Generalization?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18141", "id": "18141", "proceeding": "https://proceedings.mlr.press/v162/wang22u.html", "poster": "/media/PosterPDFs/ICML%202022/0a716fe8c7745e51a3185fc8be6ca23a_393nYls.png?t=1657975820.9592764", "slides": "", "author_site": "Thomas Wang, Adam Roberts, Daniel Hesslow, Teven Le Scao, Hyung Won Chung, Iz Beltagy, Julien Launay, Colin Raffel", "author": "Thomas Wang; Adam Roberts; Daniel Hesslow; Teven Le Scao; Hyung Won Chung; Iz Beltagy; Julien Launay; Colin Raffel", "abstract": "Large pretrained Transformer language models have been shown to exhibit zero-shot generalization, i.e. they can perform a wide variety of tasks that they were not explicitly trained on. However, the architectures and pretraining objectives used across state-of-the-art models differ significantly, and there has been limited systematic comparison of these factors. In this work, we present a large-scale evaluation of modeling choices and their impact on zero-shot generalization. 
In particular, we focus on text-to-text models and experiment with three model architectures (causal/non-causal decoder-only and encoder-decoder), trained with two different pretraining objectives (autoregressive and masked language modeling), and evaluated with and without multitask prompted finetuning. We train models with over 5 billion parameters for more than 168 billion tokens, thereby increasing the likelihood that our conclusions will transfer to even larger scales. Our experiments show that causal decoder-only models trained on an autoregressive language modeling objective exhibit the strongest zero-shot generalization after purely self-supervised pretraining. However, models with non-causal visibility on their input trained with a masked language modeling objective followed by multitask finetuning perform the best among our experiments. We therefore consider the adaptation of pretrained models across architectures and objectives. Code and checkpoints are available at https://github.com/bigscience-workshop/architecture-objective.", "bibtex": "@InProceedings{pmlr-v162-wang22u,\n title = \t {What Language Model Architecture and Pretraining Objective Works Best for Zero-Shot Generalization?},\n author = {Wang, Thomas and Roberts, Adam and Hesslow, Daniel and Scao, Teven Le and Chung, Hyung Won and Beltagy, Iz and Launay, Julien and Raffel, Colin},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {22964--22984},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22u/wang22u.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22u.html},\n abstract = \t {Large pretrained Transformer language models have been shown to exhibit zero-shot generalization, i.e. they can perform a wide variety of tasks that they were not explicitly trained on. However, the architectures and pretraining objectives used across state-of-the-art models differ significantly, and there has been limited systematic comparison of these factors. In this work, we present a large-scale evaluation of modeling choices and their impact on zero-shot generalization. In particular, we focus on text-to-text models and experiment with three model architectures (causal/non-causal decoder-only and encoder-decoder), trained with two different pretraining objectives (autoregressive and masked language modeling), and evaluated with and without multitask prompted finetuning. We train models with over 5 billion parameters for more than 168 billion tokens, thereby increasing the likelihood that our conclusions will transfer to even larger scales. Our experiments show that causal decoder-only models trained on an autoregressive language modeling objective exhibit the strongest zero-shot generalization after purely self-supervised pretraining. However, models with non-causal visibility on their input trained with a masked language modeling objective followed by multitask finetuning perform the best among our experiments. We therefore consider the adaptation of pretrained models across architectures and objectives. 
Code and checkpoints are available at https://github.com/bigscience-workshop/architecture-objective.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22u/wang22u.pdf", "supp": "", "pdf_size": 842702, "gs_citation": 214, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6052998605768030776&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Hugging Face; Google; LightOn; Allen Institute for AI; LPENS, \u00c9cole Normale Sup\u00e9rieure; Hugging Face; LightOn; Hugging Face", "aff_domain": "huggingface.co;google.com; ; ; ; ; ; ", "email": "huggingface.co;google.com; ; ; ; ; ; ", "github": "https://github.com/bigscience-workshop/architecture-objective", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/wang22u.html", "aff_unique_index": "0;1;2;3;4;0;2;0", "aff_unique_norm": "Hugging Face;Google;LightOn;Allen Institute for AI;\u00c9cole Normale Sup\u00e9rieure", "aff_unique_dep": ";Google;;;LPENS", "aff_unique_url": "https://huggingface.co;https://www.google.com;;https://allenai.org;https://www.ens.fr", "aff_unique_abbr": "Hugging Face;Google;;AI2;ENS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;2;0;0", "aff_country_unique": "United States;;France" }, { "title": "When AUC meets DRO: Optimizing Partial AUC for Deep Learning with Non-Convex Convergence Guarantee", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18411", "id": "18411", "proceeding": "https://proceedings.mlr.press/v162/zhu22g.html", "poster": "/media/PosterPDFs/ICML%202022/0765933456f074d2c75bbbad63af95e6_FxgCf9b.png?t=1658188248.3242273", "slides": "/media/icml-2022/Slides/18411.pdf", "author_site": "Dixian Zhu, Gang Li, Bokun Wang, Xiaodong Wu, Tianbao Yang", "author": "Dixian Zhu; Gang Li; Bokun Wang; Xiaodong Wu; Tianbao Yang", "abstract": "In this paper, we propose systematic and efficient gradient-based methods for both one-way and two-way partial AUC (pAUC) maximization that are applicable to deep learning. We propose new formulations of pAUC surrogate objectives by using the distributionally robust optimization (DRO) to define the loss for each individual positive data. We consider two formulations of DRO, one of which is based on conditional-value-at-risk (CVaR) that yields a non-smooth but exact estimator for pAUC, and another one is based on a KL divergence regularized DRO that yields an inexact but smooth (soft) estimator for pAUC. For both one-way and two-way pAUC maximization, we propose two algorithms and prove their convergence for optimizing their two formulations, respectively. 
Experiments demonstrate the effectiveness of the proposed algorithms for pAUC maximization for deep learning on various datasets.", "bibtex": "@InProceedings{pmlr-v162-zhu22g,\n title = \t {When {AUC} meets {DRO}: Optimizing Partial {AUC} for Deep Learning with Non-Convex Convergence Guarantee},\n author = {Zhu, Dixian and Li, Gang and Wang, Bokun and Wu, Xiaodong and Yang, Tianbao},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {27548--27573},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhu22g/zhu22g.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhu22g.html},\n abstract = \t {In this paper, we propose systematic and efficient gradient-based methods for both one-way and two-way partial AUC (pAUC) maximization that are applicable to deep learning. We propose new formulations of pAUC surrogate objectives by using the distributionally robust optimization (DRO) to define the loss for each individual positive data. We consider two formulations of DRO, one of which is based on conditional-value-at-risk (CVaR) that yields a non-smooth but exact estimator for pAUC, and another one is based on a KL divergence regularized DRO that yields an inexact but smooth (soft) estimator for pAUC. For both one-way and two-way pAUC maximization, we propose two algorithms and prove their convergence for optimizing their two formulations, respectively. Experiments demonstrate the effectiveness of the proposed algorithms for pAUC maximization for deep learning on various datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/zhu22g/zhu22g.pdf", "supp": "", "pdf_size": 1110392, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2289692722096317584&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of Iowa, Iowa City, Iowa, USA+Department of Electrical and Computer Engineering, University of Iowa, Iowa City, Iowa, USA; Department of Computer Science, University of Iowa, Iowa City, Iowa, USA+Department of Electrical and Computer Engineering, University of Iowa, Iowa City, Iowa, USA; Department of Computer Science, University of Iowa, Iowa City, Iowa, USA; Department of Electrical and Computer Engineering, University of Iowa, Iowa City, Iowa, USA; Department of Computer Science, University of Iowa, Iowa City, Iowa, USA", "aff_domain": "uiowa.edu;uiowa.edu;uiowa.edu;uiowa.edu;uiowa.edu", "email": "uiowa.edu;uiowa.edu;uiowa.edu;uiowa.edu;uiowa.edu", "github": "", "project": "www.libauc.org", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/zhu22g.html", "aff_unique_index": "0+0;0+0;0;0;0", "aff_unique_norm": "University of Iowa", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uiowa.edu", "aff_unique_abbr": "UIowa", "aff_campus_unique_index": "0+0;0+0;0;0;0", "aff_campus_unique": "Iowa City", "aff_country_unique_index": "0+0;0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "When Are Linear Stochastic Bandits Attackable?", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16887", "id": "16887", "proceeding": "https://proceedings.mlr.press/v162/wang22ai.html", "poster": 
"/media/PosterPDFs/ICML%202022/602443a3d6907117d8b4a308844e963e_oIEYNiO.png?t=1658372487.5431788", "slides": "", "author_site": "Huazheng Wang, Haifeng Xu, Hongning Wang", "author": "Huazheng Wang; Haifeng Xu; Hongning Wang", "abstract": "We study adversarial attacks on linear stochastic bandits: by manipulating the rewards, an adversary aims to control the behaviour of the bandit algorithm. Perhaps surprisingly, we first show that some attack goals can never be achieved. This is in a sharp contrast to context-free stochastic bandits, and is intrinsically due to the correlation among arms in linear stochastic bandits. Motivated by this finding, this paper studies the attackability of a $k$-armed linear bandit environment. We first provide a complete necessity and sufficiency characterization of attackability based on the geometry of the arms\u2019 context vectors. We then propose a two-stage attack method against LinUCB and Robust Phase Elimination. The method first asserts whether the given environment is attackable; and if yes, it poisons the rewards to force the algorithm to pull a target arm linear times using only a sublinear cost. Numerical experiments further validate the effectiveness and cost-efficiency of the proposed attack method.", "bibtex": "@InProceedings{pmlr-v162-wang22ai,\n title = \t {When Are Linear Stochastic Bandits Attackable?},\n author = {Wang, Huazheng and Xu, Haifeng and Wang, Hongning},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {23254--23273},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/wang22ai/wang22ai.pdf},\n url = \t {https://proceedings.mlr.press/v162/wang22ai.html},\n abstract = \t {We study adversarial attacks on linear stochastic bandits: by manipulating the rewards, an adversary aims to control the behaviour of the bandit algorithm. Perhaps surprisingly, we first show that some attack goals can never be achieved. This is in a sharp contrast to context-free stochastic bandits, and is intrinsically due to the correlation among arms in linear stochastic bandits. Motivated by this finding, this paper studies the attackability of a $k$-armed linear bandit environment. We first provide a complete necessity and sufficiency characterization of attackability based on the geometry of the arms\u2019 context vectors. We then propose a two-stage attack method against LinUCB and Robust Phase Elimination. The method first asserts whether the given environment is attackable; and if yes, it poisons the rewards to force the algorithm to pull a target arm linear times using only a sublinear cost. 
Numerical experiments further validate the effectiveness and cost-efficiency of the proposed attack method.}\n}", "pdf": "https://proceedings.mlr.press/v162/wang22ai/wang22ai.pdf", "supp": "", "pdf_size": 601312, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3481795971771177064&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Princeton University; University of Virginia; University of Virginia", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/wang22ai.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Princeton University;University of Virginia", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.virginia.edu", "aff_unique_abbr": "Princeton;UVA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "When and How Mixup Improves Calibration", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16683", "id": "16683", "proceeding": "https://proceedings.mlr.press/v162/zhang22f.html", "poster": "", "slides": "", "author_site": "Linjun Zhang, Zhun Deng, Kenji Kawaguchi, James Zou", "author": "Linjun Zhang; Zhun Deng; Kenji Kawaguchi; James Zou", "abstract": "In many machine learning applications, it is important for the model to provide confidence scores that accurately capture its prediction uncertainty. Although modern learning methods have achieved great success in predictive accuracy, generating calibrated confidence scores remains a major challenge. Mixup, a popular yet simple data augmentation technique based on taking convex combinations of pairs of training examples, has been empirically found to significantly improve confidence calibration across diverse applications. However, when and how Mixup helps calibration is still a mystery. In this paper, we theoretically prove that Mixup improves calibration in", "bibtex": "@InProceedings{pmlr-v162-zhang22f,\n title = \t {When and How Mixup Improves Calibration},\n author = {Zhang, Linjun and Deng, Zhun and Kawaguchi, Kenji and Zou, James},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {26135--26160},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/zhang22f/zhang22f.pdf},\n url = \t {https://proceedings.mlr.press/v162/zhang22f.html},\n abstract = \t {In many machine learning applications, it is important for the model to provide confidence scores that accurately capture its prediction uncertainty. Although modern learning methods have achieved great success in predictive accuracy, generating calibrated confidence scores remains a major challenge. Mixup, a popular yet simple data augmentation technique based on taking convex combinations of pairs of training examples, has been empirically found to significantly improve confidence calibration across diverse applications. However, when and how Mixup helps calibration is still a mystery. 
In this paper, we theoretically prove that Mixup improves calibration in", "pdf": "https://proceedings.mlr.press/v162/zhang22f/zhang22f.pdf", "supp": "", "pdf_size": 1534489, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11660657928741659824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Rutgers University; Harvard University; National University of Singapore; Stanford University", "aff_domain": "rutgers.edu;g.harvard.edu; ; ", "email": "rutgers.edu;g.harvard.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/zhang22f.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Rutgers University;Harvard University;National University of Singapore;Stanford University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rutgers.edu;https://www.harvard.edu;https://www.nus.edu.sg;https://www.stanford.edu", "aff_unique_abbr": "Rutgers;Harvard;NUS;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Singapore" }, { "title": "Why Should I Trust You, Bellman? The Bellman Error is a Poor Replacement for Value Error", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16605", "id": "16605", "proceeding": "https://proceedings.mlr.press/v162/fujimoto22a.html", "poster": "", "slides": "", "author_site": "Scott Fujimoto, David Meger, Doina Precup, Ofir Nachum, Shixiang Gu", "author": "Scott Fujimoto; David Meger; Doina Precup; Ofir Nachum; Shixiang Shane Gu", "abstract": "In this work, we study the use of the Bellman equation as a surrogate objective for value prediction accuracy. While the Bellman equation is uniquely solved by the true value function over all state-action pairs, we find that the Bellman error (the difference between both sides of the equation) is a poor proxy for the accuracy of the value function. In particular, we show that (1) due to cancellations from both sides of the Bellman equation, the magnitude of the Bellman error is only weakly related to the distance to the true value function, even when considering all state-action pairs, and (2) in the finite data regime, the Bellman equation can be satisfied exactly by infinitely many suboptimal solutions. This means that the Bellman error can be minimized without improving the accuracy of the value function. We demonstrate these phenomena through a series of propositions, illustrative toy examples, and empirical analysis in standard benchmark domains.", "bibtex": "@InProceedings{pmlr-v162-fujimoto22a,\n title = \t {Why Should I Trust You, Bellman? {T}he {B}ellman Error is a Poor Replacement for Value Error},\n author = {Fujimoto, Scott and Meger, David and Precup, Doina and Nachum, Ofir and Gu, Shixiang Shane},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {6918--6943},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/fujimoto22a/fujimoto22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/fujimoto22a.html},\n abstract = \t {In this work, we study the use of the Bellman equation as a surrogate objective for value prediction accuracy. 
While the Bellman equation is uniquely solved by the true value function over all state-action pairs, we find that the Bellman error (the difference between both sides of the equation) is a poor proxy for the accuracy of the value function. In particular, we show that (1) due to cancellations from both sides of the Bellman equation, the magnitude of the Bellman error is only weakly related to the distance to the true value function, even when considering all state-action pairs, and (2) in the finite data regime, the Bellman equation can be satisfied exactly by infinitely many suboptimal solutions. This means that the Bellman error can be minimized without improving the accuracy of the value function. We demonstrate these phenomena through a series of propositions, illustrative toy examples, and empirical analysis in standard benchmark domains.}\n}", "pdf": "https://proceedings.mlr.press/v162/fujimoto22a/fujimoto22a.pdf", "supp": "", "pdf_size": 10179249, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16156209391795811234&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Mila, McGill University+Google Research, Brain Team; Mila, McGill University; Mila, McGill University+DeepMind; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "mail.mcgill.ca; ; ; ; ", "email": "mail.mcgill.ca; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v162/fujimoto22a.html", "aff_unique_index": "0+1;0;0+2;1;1", "aff_unique_norm": "McGill University;Google;DeepMind", "aff_unique_dep": "Mila;Google Research;", "aff_unique_url": "https://www.mcgill.ca;https://research.google;https://deepmind.com", "aff_unique_abbr": "McGill;Google;DeepMind", "aff_campus_unique_index": "0+1;0;0;1;1", "aff_campus_unique": "Montreal;Mountain View;", "aff_country_unique_index": "0+1;0;0+2;1;1", "aff_country_unique": "Canada;United States;United Kingdom" }, { "title": "Why the Rich Get Richer? On the Balancedness of Random Partition Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16513", "id": "16513", "proceeding": "https://proceedings.mlr.press/v162/lee22j.html", "poster": "/media/PosterPDFs/ICML%202022/e0a209539d1e74ab9fe46b9e01a19a97_ux87vmm.png?t=1657201491.9772666", "slides": "", "author_site": "Changwoo Lee, Huiyan Sang", "author": "Changwoo J Lee; Huiyan Sang", "abstract": "Random partition models are widely used in Bayesian methods for various clustering tasks, such as mixture models, topic models, and community detection problems. While the number of clusters induced by random partition models has been studied extensively, another important model property regarding the balancedness of partition has been largely neglected. We formulate a framework to define and theoretically study the balancedness of exchangeable random partition models, by analyzing how a model assigns probabilities to partitions with different levels of balancedness. We demonstrate that the \"rich-get-richer\" characteristic of many existing popular random partition models is an inevitable consequence of two common assumptions: product-form exchangeability and projectivity. We propose a principled way to compare the balancedness of random partition models, which gives a better understanding of what model works better and what doesn\u2019t for different applications. 
We also introduce the \"rich-get-poorer\" random partition models and illustrate their application to entity resolution tasks.", "bibtex": "@InProceedings{pmlr-v162-lee22j,\n title = \t {Why the Rich Get Richer? {O}n the Balancedness of Random Partition Models},\n author = {Lee, Changwoo J and Sang, Huiyan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {12521--12541},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/lee22j/lee22j.pdf},\n url = \t {https://proceedings.mlr.press/v162/lee22j.html},\n abstract = \t {Random partition models are widely used in Bayesian methods for various clustering tasks, such as mixture models, topic models, and community detection problems. While the number of clusters induced by random partition models has been studied extensively, another important model property regarding the balancedness of partition has been largely neglected. We formulate a framework to define and theoretically study the balancedness of exchangeable random partition models, by analyzing how a model assigns probabilities to partitions with different levels of balancedness. We demonstrate that the \"rich-get-richer\" characteristic of many existing popular random partition models is an inevitable consequence of two common assumptions: product-form exchangeability and projectivity. We propose a principled way to compare the balancedness of random partition models, which gives a better understanding of what model works better and what doesn\u2019t for different applications. We also introduce the \"rich-get-poorer\" random partition models and illustrate their application to entity resolution tasks.}\n}", "pdf": "https://proceedings.mlr.press/v162/lee22j/lee22j.pdf", "supp": "", "pdf_size": 3227209, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16428432009568142650&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, Texas A&M University, Texas, USA; Department of Statistics, Texas A&M University, Texas, USA", "aff_domain": "stat.tamu.edu; ", "email": "stat.tamu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v162/lee22j.html", "aff_unique_index": "0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Texas", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Wide Bayesian neural networks have a simple weight posterior: theory and accelerated sampling", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18159", "id": "18159", "proceeding": "https://proceedings.mlr.press/v162/hron22a.html", "poster": "", "slides": "", "author_site": "Jiri Hron, Roman Novak, Jeffrey Pennington, Jascha Sohl-Dickstein", "author": "Jiri Hron; Roman Novak; Jeffrey Pennington; Jascha Sohl-Dickstein", "abstract": "We introduce repriorisation, a data-dependent reparameterisation which transforms a Bayesian neural network (BNN) posterior to a distribution whose KL divergence to the BNN prior vanishes as layer widths grow. 
The repriorisation map acts directly on parameters, and its analytic simplicity complements the known neural network Gaussian process (NNGP) behaviour of wide BNNs in function space. Exploiting the repriorisation, we develop a Markov chain Monte Carlo (MCMC) posterior sampling algorithm which mixes faster the wider the BNN. This contrasts with the typically poor performance of MCMC in high dimensions. We observe up to 50x higher effective sample size relative to no reparametrisation for both fully-connected and residual networks. Improvements are achieved at all widths, with the margin between reparametrised and standard BNNs growing with layer width.", "bibtex": "@InProceedings{pmlr-v162-hron22a,\n title = \t {Wide {B}ayesian neural networks have a simple weight posterior: theory and accelerated sampling},\n author = {Hron, Jiri and Novak, Roman and Pennington, Jeffrey and Sohl-Dickstein, Jascha},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8926--8945},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/hron22a/hron22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/hron22a.html},\n abstract = \t {We introduce repriorisation, a data-dependent reparameterisation which transforms a Bayesian neural network (BNN) posterior to a distribution whose KL divergence to the BNN prior vanishes as layer widths grow. The repriorisation map acts directly on parameters, and its analytic simplicity complements the known neural network Gaussian process (NNGP) behaviour of wide BNNs in function space. Exploiting the repriorisation, we develop a Markov chain Monte Carlo (MCMC) posterior sampling algorithm which mixes faster the wider the BNN. This contrasts with the typically poor performance of MCMC in high dimensions. We observe up to 50x higher effective sample size relative to no reparametrisation for both fully-connected and residual networks. 
Improvements are achieved at all widths, with the margin between reparametrised and standard BNNs growing with layer width.}\n}", "pdf": "https://proceedings.mlr.press/v162/hron22a/hron22a.pdf", "supp": "", "pdf_size": 1468453, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16784492759719695101&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research, USA+University of Cambridge, UK; Google Research, USA; Google Research, USA; Google Research, USA", "aff_domain": "cam.ac.uk; ; ; ", "email": "cam.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v162/hron22a.html", "aff_unique_index": "0+1;0;0;0", "aff_unique_norm": "Google;University of Cambridge", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.cam.ac.uk", "aff_unique_abbr": "Google;Cambridge", "aff_campus_unique_index": "0+1;0;0;0", "aff_campus_unique": "Mountain View;Cambridge", "aff_country_unique_index": "0+1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Wide Neural Networks Forget Less Catastrophically", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17091", "id": "17091", "proceeding": "https://proceedings.mlr.press/v162/mirzadeh22a.html", "poster": "/media/PosterPDFs/ICML%202022/febb7aafcbde43930cdedf8c5153c867.png?t=1658101219.462107", "slides": "/media/icml-2022/Slides/17091_8OsmNVd.pdf", "author_site": "Seyed Iman Mirzadeh, Arslan Chaudhry, Dong Yin, Huiyi Hu, Razvan Pascanu, Dilan Gorur, Mehrdad Farajtabar", "author": "Seyed Iman Mirzadeh; Arslan Chaudhry; Dong Yin; Huiyi Hu; Razvan Pascanu; Dilan Gorur; Mehrdad Farajtabar", "abstract": "A primary focus area in continual learning research is alleviating the \"catastrophic forgetting\" problem in neural networks by designing new algorithms that are more robust to the distribution shifts. While the recent progress in continual learning literature is encouraging, our understanding of what properties of neural networks contribute to catastrophic forgetting is still limited. To address this, instead of focusing on continual learning algorithms, in this work, we focus on the model itself and study the impact of \"width\" of the neural network architecture on catastrophic forgetting, and show that width has a surprisingly significant effect on forgetting. To explain this effect, we study the learning dynamics of the network from various perspectives such as gradient orthogonality, sparsity, and lazy training regime. 
We provide potential explanations that are consistent with the empirical results across different architectures and continual learning benchmarks.", "bibtex": "@InProceedings{pmlr-v162-mirzadeh22a,\n title = \t {Wide Neural Networks Forget Less Catastrophically},\n author = {Mirzadeh, Seyed Iman and Chaudhry, Arslan and Yin, Dong and Hu, Huiyi and Pascanu, Razvan and Gorur, Dilan and Farajtabar, Mehrdad},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {15699--15717},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mirzadeh22a/mirzadeh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mirzadeh22a.html},\n abstract = \t {A primary focus area in continual learning research is alleviating the \"catastrophic forgetting\" problem in neural networks by designing new algorithms that are more robust to the distribution shifts. While the recent progress in continual learning literature is encouraging, our understanding of what properties of neural networks contribute to catastrophic forgetting is still limited. To address this, instead of focusing on continual learning algorithms, in this work, we focus on the model itself and study the impact of \"width\" of the neural network architecture on catastrophic forgetting, and show that width has a surprisingly significant effect on forgetting. To explain this effect, we study the learning dynamics of the network from various perspectives such as gradient orthogonality, sparsity, and lazy training regime. 
We provide potential explanations that are consistent with the empirical results across different architectures and continual learning benchmarks.}\n}", "pdf": "https://proceedings.mlr.press/v162/mirzadeh22a/mirzadeh22a.pdf", "supp": "", "pdf_size": 650068, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1868657248956733438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Washington State University + DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "wsu.edu; ; ; ; ; ;google.com", "email": "wsu.edu; ; ; ; ; ;google.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v162/mirzadeh22a.html", "aff_unique_index": "0+1;1;1;1;1;1;1", "aff_unique_norm": "Washington State University;DeepMind", "aff_unique_dep": ";", "aff_unique_url": "https://wsu.edu;https://deepmind.com", "aff_unique_abbr": "WSU;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;1;1;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Winning the Lottery Ahead of Time: Efficient Early Network Pruning", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17709", "id": "17709", "proceeding": "https://proceedings.mlr.press/v162/rachwan22a.html", "poster": "/media/PosterPDFs/ICML%202022/0ed9422357395a0d4879191c66f4faa2.png?t=1657449613.8359313", "slides": "", "author_site": "John Rachwan, Daniel Z\u00fcgner, Bertrand Charpentier, Simon Geisler, Morgane Ayle, Stephan G\u00fcnnemann", "author": "John Rachwan; Daniel Z\u00fcgner; Bertrand Charpentier; Simon Geisler; Morgane Ayle; Stephan G\u00fcnnemann", "abstract": "Pruning, the task of sparsifying deep neural networks, received increasing attention recently. Although state-of-the-art pruning methods extract highly sparse models, they neglect two main challenges: (1) the process of finding these sparse models is often very expensive; (2) unstructured pruning does not provide benefits in terms of GPU memory, training time, or carbon emissions. We propose Early Compression via Gradient Flow Preservation (EarlyCroP), which efficiently extracts state-of-the-art sparse models before or early in training addressing challenge (1), and can be applied in a structured manner addressing challenge (2). This enables us to train sparse networks on commodity GPUs whose dense versions would be too large, thereby saving costs and reducing hardware requirements. We empirically show that EarlyCroP outperforms a rich set of baselines for many tasks (incl. classification, regression) and domains (incl. computer vision, natural language processing, and reinforcement learning). 
EarlyCroP leads to accuracy comparable to dense training while outperforming pruning baselines.", "bibtex": "@InProceedings{pmlr-v162-rachwan22a,\n title = \t {Winning the Lottery Ahead of Time: Efficient Early Network Pruning},\n author = {Rachwan, John and Z{\\\"u}gner, Daniel and Charpentier, Bertrand and Geisler, Simon and Ayle, Morgane and G{\\\"u}nnemann, Stephan},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {18293--18309},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/rachwan22a/rachwan22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/rachwan22a.html},\n abstract = \t {Pruning, the task of sparsifying deep neural networks, received increasing attention recently. Although state-of-the-art pruning methods extract highly sparse models, they neglect two main challenges: (1) the process of finding these sparse models is often very expensive; (2) unstructured pruning does not provide benefits in terms of GPU memory, training time, or carbon emissions. We propose Early Compression via Gradient Flow Preservation (EarlyCroP), which efficiently extracts state-of-the-art sparse models before or early in training addressing challenge (1), and can be applied in a structured manner addressing challenge (2). This enables us to train sparse networks on commodity GPUs whose dense versions would be too large, thereby saving costs and reducing hardware requirements. We empirically show that EarlyCroP outperforms a rich set of baselines for many tasks (incl. classification, regression) and domains (incl. computer vision, natural language processing, and reinforcement learning). 
EarlyCroP leads to accuracy comparable to dense training while outperforming pruning baselines.}\n}", "pdf": "https://proceedings.mlr.press/v162/rachwan22a/rachwan22a.pdf", "supp": "", "pdf_size": 815980, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3167787605705434615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Technical University Munich, Germany; Technical University Munich, Germany; Technical University Munich, Germany; Technical University Munich, Germany; Technical University Munich, Germany; Technical University Munich, Germany", "aff_domain": "tum.de; ; ; ; ; ", "email": "tum.de; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/rachwan22a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Technical University Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "XAI for Transformers: Better Explanations through Conservative Propagation", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17871", "id": "17871", "proceeding": "https://proceedings.mlr.press/v162/ali22a.html", "poster": "/media/PosterPDFs/ICML%202022/c366c2c97d47b02b24c3ecade4c40a01_krHdLcY.png?t=1657822452.5295537", "slides": "", "author_site": "Ameen Ali, Thomas Schnake, Oliver Eberle, Gr\u00e9goire Montavon, Klaus-robert Mueller, Lior Wolf", "author": "Ameen Ali; Thomas Schnake; Oliver Eberle; Gr\u00e9goire Montavon; Klaus-Robert M\u00fcller; Lior Wolf", "abstract": "Transformers have become an important workhorse of machine learning, with numerous applications. This necessitates the development of reliable methods for increasing their transparency. Multiple interpretability methods, often based on gradient information, have been proposed. We show that the gradient in a Transformer reflects the function only locally, and thus fails to reliably identify the contribution of input features to the prediction. We identify Attention Heads and LayerNorm as main reasons for such unreliable explanations and propose a more stable way for propagation through these layers. Our proposal, which can be seen as a proper extension of the well-established LRP method to Transformers, is shown both theoretically and empirically to overcome the deficiency of a simple gradient-based approach, and achieves state-of-the-art explanation performance on a broad range of Transformer models and datasets.", "bibtex": "@InProceedings{pmlr-v162-ali22a,\n title = \t {{XAI} for Transformers: Better Explanations through Conservative Propagation},\n author = {Ali, Ameen and Schnake, Thomas and Eberle, Oliver and Montavon, Gr{\\'e}goire and M{\\\"u}ller, Klaus-Robert and Wolf, Lior},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {435--451},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ali22a/ali22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ali22a.html},\n abstract = \t {Transformers have become an important workhorse of machine learning, with numerous applications. 
This necessitates the development of reliable methods for increasing their transparency. Multiple interpretability methods, often based on gradient information, have been proposed. We show that the gradient in a Transformer reflects the function only locally, and thus fails to reliably identify the contribution of input features to the prediction. We identify Attention Heads and LayerNorm as main reasons for such unreliable explanations and propose a more stable way for propagation through these layers. Our proposal, which can be seen as a proper extension of the well-established LRP method to Transformers, is shown both theoretically and empirically to overcome the deficiency of a simple gradient-based approach, and achieves state-of-the-art explanation performance on a broad range of Transformer models and datasets.}\n}", "pdf": "https://proceedings.mlr.press/v162/ali22a/ali22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/ali22a-supp.zip", "pdf_size": 4410805, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8318067021687688094&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "https://github.com/AmeenAli/XAI_Transformers", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ali22a.html" }, { "title": "You Only Cut Once: Boosting Data Augmentation with a Single Cut", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17623", "id": "17623", "proceeding": "https://proceedings.mlr.press/v162/han22a.html", "poster": "", "slides": "", "author_site": "Junlin Han, Pengfei Fang, Weihao Li, Jie Hong, Mohammad Ali Armin, Ian Reid, Lars Petersson, HONGDONG LI", "author": "Junlin Han; Pengfei Fang; Weihao Li; Jie Hong; Mohammad Ali Armin; Ian Reid; Lars Petersson; Hongdong Li", "abstract": "We present You Only Cut Once (YOCO) for performing data augmentations. YOCO cuts one image into two pieces and performs data augmentations individually within each piece. Applying YOCO improves the diversity of the augmentation per sample and encourages neural networks to recognize objects from partial information. YOCO enjoys the properties of parameter-free, easy usage, and boosting almost all augmentations for free. Thorough experiments are conducted to evaluate its effectiveness. We first demonstrate that YOCO can be seamlessly applied to varying data augmentations, neural network architectures, and brings performance gains on CIFAR and ImageNet classification tasks, sometimes surpassing conventional image-level augmentation by large margins. Moreover, we show YOCO benefits contrastive pre-training toward a more powerful representation that can be better transferred to multiple downstream tasks. 
Finally, we study a number of variants of YOCO and empirically analyze the performance for respective settings.", "bibtex": "@InProceedings{pmlr-v162-han22a,\n title = \t {You Only Cut Once: Boosting Data Augmentation with a Single Cut},\n author = {Han, Junlin and Fang, Pengfei and Li, Weihao and Hong, Jie and Armin, Mohammad Ali and Reid, Ian and Petersson, Lars and Li, Hongdong},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {8196--8212},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/han22a/han22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/han22a.html},\n abstract = \t {We present You Only Cut Once (YOCO) for performing data augmentations. YOCO cuts one image into two pieces and performs data augmentations individually within each piece. Applying YOCO improves the diversity of the augmentation per sample and encourages neural networks to recognize objects from partial information. YOCO enjoys the properties of parameter-free, easy usage, and boosting almost all augmentations for free. Thorough experiments are conducted to evaluate its effectiveness. We first demonstrate that YOCO can be seamlessly applied to varying data augmentations, neural network architectures, and brings performance gains on CIFAR and ImageNet classification tasks, sometimes surpassing conventional image-level augmentation by large margins. Moreover, we show YOCO benefits contrastive pre-training toward a more powerful representation that can be better transferred to multiple downstream tasks. 
Finally, we study a number of variants of YOCO and empirically analyze the performance for respective settings.}\n}", "pdf": "https://proceedings.mlr.press/v162/han22a/han22a.pdf", "supp": "", "pdf_size": 2270598, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=501111593877482032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Data61-CSIRO, Canberra, Australia+Australian National University, Canberra, Australia+University of Adelaide, Adelaide, Australia; Data61-CSIRO, Canberra, Australia+Australian National University, Canberra, Australia; Data61-CSIRO, Canberra, Australia; Data61-CSIRO, Canberra, Australia+Australian National University, Canberra, Australia; Data61-CSIRO, Canberra, Australia; University of Adelaide, Adelaide, Australia; Data61-CSIRO, Canberra, Australia; Australian National University, Canberra, Australia", "aff_domain": "data61.csiro.au; ; ; ; ; ; ; ", "email": "data61.csiro.au; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v162/han22a.html", "aff_unique_index": "0+1+2;0+1;0;0+1;0;2;0;1", "aff_unique_norm": "Commonwealth Scientific and Industrial Research Organisation;Australian National University;University of Adelaide", "aff_unique_dep": "Data61;;", "aff_unique_url": "https://www.csiro.au;https://www.anu.edu.au;https://www.adelaide.edu.au", "aff_unique_abbr": "CSIRO;ANU;Adelaide", "aff_campus_unique_index": "0+0+1;0+0;0;0+0;0;1;0;0", "aff_campus_unique": "Canberra;Adelaide", "aff_country_unique_index": "0+0+0;0+0;0;0+0;0;0;0;0", "aff_country_unique": "Australia" }, { "title": "YourTTS: Towards Zero-Shot Multi-Speaker TTS and Zero-Shot Voice Conversion for Everyone", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16091", "id": "16091", "proceeding": "https://proceedings.mlr.press/v162/casanova22a.html", "poster": "", "slides": "", "author_site": "Edresson Casanova, Julian Weber, Christopher Shulby, Arnaldo Candido Junior, Eren G\u00f6lge, Moacir Ponti", "author": "Edresson Casanova; Julian Weber; Christopher D Shulby; Arnaldo Candido Junior; Eren G\u00f6lge; Moacir A Ponti", "abstract": "YourTTS brings the power of a multilingual approach to the task of zero-shot multi-speaker TTS. Our method builds upon the VITS model and adds several novel modifications for zero-shot multi-speaker and multilingual training. We achieved state-of-the-art (SOTA) results in zero-shot multi-speaker TTS and results comparable to SOTA in zero-shot voice conversion on the VCTK dataset. Additionally, our approach achieves promising results in a target language with a single-speaker dataset, opening possibilities for zero-shot multi-speaker TTS and zero-shot voice conversion systems in low-resource languages. Finally, it is possible to fine-tune the YourTTS model with less than 1 minute of speech and achieve state-of-the-art results in voice similarity and with reasonable quality. 
This is important to allow synthesis for speakers with a very different voice or recording characteristics from those seen during training.", "bibtex": "@InProceedings{pmlr-v162-casanova22a,\n title = \t {{Y}our{TTS}: Towards Zero-Shot Multi-Speaker {TTS} and Zero-Shot Voice Conversion for Everyone},\n author = {Casanova, Edresson and Weber, Julian and Shulby, Christopher D and Junior, Arnaldo Candido and G{\\\"o}lge, Eren and Ponti, Moacir A},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {2709--2720},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/casanova22a/casanova22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/casanova22a.html},\n abstract = \t {YourTTS brings the power of a multilingual approach to the task of zero-shot multi-speaker TTS. Our method builds upon the VITS model and adds several novel modifications for zero-shot multi-speaker and multilingual training. We achieved state-of-the-art (SOTA) results in zero-shot multi-speaker TTS and results comparable to SOTA in zero-shot voice conversion on the VCTK dataset. Additionally, our approach achieves promising results in a target language with a single-speaker dataset, opening possibilities for zero-shot multi-speaker TTS and zero-shot voice conversion systems in low-resource languages. Finally, it is possible to fine-tune the YourTTS model with less than 1 minute of speech and achieve state-of-the-art results in voice similarity and with reasonable quality. 
This is important to allow synthesis for speakers with a very different voice or recording characteristics from those seen during training.}\n}", "pdf": "https://proceedings.mlr.press/v162/casanova22a/casanova22a.pdf", "supp": "", "pdf_size": 530983, "gs_citation": 526, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8575580251111777245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o, Universidade de S\u00e3o Paulo, Brazil+Coqui, Germany; Sopra Banking Software, France+Coqui, Germany; Defined.ai, United States of America+Coqui, Germany; Federal University of Technology \u2013 Paran\u00e1, Brazil+Coqui, Germany; Coqui, Germany; Mercado Livre, Brazil+Coqui, Germany", "aff_domain": "usp.br; ; ; ; ; ", "email": "usp.br; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/casanova22a.html", "aff_unique_index": "0+1;2+1;3+1;4+1;1;5+1", "aff_unique_norm": "Universidade de S\u00e3o Paulo;Coqui;Sopra Banking Software;Defined.ai;Federal University of Technology \u2013 Paran\u00e1;Mercado Livre", "aff_unique_dep": "Instituto de Ci\u00eancias Matem\u00e1ticas e de Computa\u00e7\u00e3o;;;;;", "aff_unique_url": "https://www.icmc.usp.br;;https://www.sopra-banking.com;https://www.defined.ai;https://www.utfpr.edu.br;https://www.mercadolivre.com.br", "aff_unique_abbr": "USP;;;;UTFPR;ML", "aff_campus_unique_index": ";;;;", "aff_campus_unique": "", "aff_country_unique_index": "0+1;2+1;3+1;0+1;1;0+1", "aff_country_unique": "Brazil;Germany;France;United States" }, { "title": "Zero-Shot Reward Specification via Grounded Natural Language", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18091", "id": "18091", "proceeding": "https://proceedings.mlr.press/v162/mahmoudieh22a.html", "poster": "/media/PosterPDFs/ICML%202022/dcacff2565700c8f88f59cf4a16f9dfc.png?t=1658433932.5356255", "slides": "", "author_site": "Parsa Mahmoudieh, Deepak Pathak, Trevor Darrell", "author": "Parsa Mahmoudieh; Deepak Pathak; Trevor Darrell", "abstract": "Reward signals in reinforcement learning are expensive to design and often require access to the true state which is not available in the real world. Common alternatives are usually demonstrations or goal images which can be labor-intensive to collect. On the other hand, text descriptions provide a general, natural, and low-effort way of communicating the desired task. However, prior works in learning text-conditioned policies still rely on rewards that are defined using either true state or labeled expert demonstrations. We use recent developments in building large-scale visuolanguage models like CLIP to devise a framework that generates the task reward signal just from goal text description and raw pixel observations which is then used to learn the task policy. We evaluate the proposed framework on control and robotic manipulation tasks. 
Finally, we distill the individual task policies into a single goal text conditioned policy that can generalize in a zero-shot manner to new tasks with unseen objects and unseen goal text descriptions.", "bibtex": "@InProceedings{pmlr-v162-mahmoudieh22a,\n title = \t {Zero-Shot Reward Specification via Grounded Natural Language},\n author = {Mahmoudieh, Parsa and Pathak, Deepak and Darrell, Trevor},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {14743--14752},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/mahmoudieh22a/mahmoudieh22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/mahmoudieh22a.html},\n abstract = \t {Reward signals in reinforcement learning are expensive to design and often require access to the true state which is not available in the real world. Common alternatives are usually demonstrations or goal images which can be labor-intensive to collect. On the other hand, text descriptions provide a general, natural, and low-effort way of communicating the desired task. However, prior works in learning text-conditioned policies still rely on rewards that are defined using either true state or labeled expert demonstrations. We use recent developments in building large-scale visuolanguage models like CLIP to devise a framework that generates the task reward signal just from goal text description and raw pixel observations which is then used to learn the task policy. We evaluate the proposed framework on control and robotic manipulation tasks. 
Finally, we distill the individual task policies into a single goal text conditioned policy that can generalize in a zero-shot manner to new tasks with unseen objects and unseen goal text descriptions.}\n}", "pdf": "https://proceedings.mlr.press/v162/mahmoudieh22a/mahmoudieh22a.pdf", "supp": "", "pdf_size": 1324470, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7105001314229366579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "UC Berkeley; Carnegie Mellon University; UC Berkeley", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/mahmoudieh22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu", "aff_unique_abbr": "UC Berkeley;CMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Zero-shot AutoML with Pretrained Models", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/18007", "id": "18007", "proceeding": "https://proceedings.mlr.press/v162/ozturk22a.html", "poster": "/media/PosterPDFs/ICML%202022/2557911c1bf75c2b643afb4ecbfc8ec2.png?t=1657281815.0995193", "slides": "", "author_site": "Ekrem \u00d6zt\u00fcrk, Fabio Ferreira, Hadi S Jomaa, Lars Schmidt-Thieme, Josif Grabocka, Frank Hutter", "author": "Ekrem \u00d6zt\u00fcrk; Fabio Ferreira; Hadi Jomaa; Lars Schmidt-Thieme; Josif Grabocka; Frank Hutter", "abstract": "Given a new dataset D and a low compute budget, how should we choose a pre-trained model to fine-tune to D, and set the fine-tuning hyperparameters without risking overfitting, particularly if D is small? Here, we extend automated machine learning (AutoML) to best make these choices. Our domain-independent meta-learning approach learns a zero-shot surrogate model which, at test time, allows to select the right deep learning (DL) pipeline (including the pre-trained model and fine-tuning hyperparameters) for a new dataset D given only trivial meta-features describing D such as image resolution or the number of classes. To train this zero-shot model, we collect performance data for many DL pipelines on a large collection of datasets and meta-train on this data to minimize a pairwise ranking objective. 
We evaluate our approach under the strict time limit of the vision track of the ChaLearn AutoDL challenge benchmark, clearly outperforming all challenge contenders.", "bibtex": "@InProceedings{pmlr-v162-ozturk22a,\n title = \t {Zero-shot {A}uto{ML} with Pretrained Models},\n author = {{\\\"O}zt{\\\"u}rk, Ekrem and Ferreira, Fabio and Jomaa, Hadi and Schmidt-Thieme, Lars and Grabocka, Josif and Hutter, Frank},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {17138--17155},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/ozturk22a/ozturk22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/ozturk22a.html},\n abstract = \t {Given a new dataset D and a low compute budget, how should we choose a pre-trained model to fine-tune to D, and set the fine-tuning hyperparameters without risking overfitting, particularly if D is small? Here, we extend automated machine learning (AutoML) to best make these choices. Our domain-independent meta-learning approach learns a zero-shot surrogate model which, at test time, allows to select the right deep learning (DL) pipeline (including the pre-trained model and fine-tuning hyperparameters) for a new dataset D given only trivial meta-features describing D such as image resolution or the number of classes. To train this zero-shot model, we collect performance data for many DL pipelines on a large collection of datasets and meta-train on this data to minimize a pairwise ranking objective. We evaluate our approach under the strict time limit of the vision track of the ChaLearn AutoDL challenge benchmark, clearly outperforming all challenge contenders.}\n}", "pdf": "https://proceedings.mlr.press/v162/ozturk22a/ozturk22a.pdf", "supp": "", "pdf_size": 2015803, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4155086096102443249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Freiburg; University of Freiburg; University of Hildesheim; University of Hildesheim; University of Freiburg; Bosch Center for Artificial Intelligence", "aff_domain": "cs.uni-freiburg.de; ; ; ; ; ", "email": "cs.uni-freiburg.de; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/ozturk22a.html", "aff_unique_index": "0;0;1;1;0;2", "aff_unique_norm": "University of Freiburg;University of Hildesheim;Bosch Center for Artificial Intelligence", "aff_unique_dep": ";;Center for Artificial Intelligence", "aff_unique_url": "https://www.uni-freiburg.de;https://www.uni-hildesheim.de/;https://www.bosch-ai.com", "aff_unique_abbr": "UoF;;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2022/poster/16643", "id": "16643", "proceeding": "https://proceedings.mlr.press/v162/baevski22a.html", "poster": "/media/PosterPDFs/ICML%202022/81e793dc8317a3dbc3534ed3f242c418.png?t=1657755671.0294495", "slides": "", "author_site": "Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli", "author": "Alexei Baevski; 
Wei-Ning Hsu; Qiantong Xu; Arun Babu; Jiatao Gu; Michael Auli", "abstract": "While the general idea of self-supervised learning is identical across modalities, the actual algorithms and objectives differ widely because they were developed with a single modality in mind. To get us closer to general self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech, NLP or computer vision. The core idea is to predict latent representations of the full input data based on a masked view of the input in a self-distillation setup using a standard Transformer architecture. Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which are local in nature, data2vec predicts contextualized latent representations that contain information from the entire input. Experiments on the major benchmarks of speech recognition, image classification, and natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches.", "bibtex": "@InProceedings{pmlr-v162-baevski22a,\n title = \t {data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language},\n author = {Baevski, Alexei and Hsu, Wei-Ning and Xu, Qiantong and Babu, Arun and Gu, Jiatao and Auli, Michael},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {1298--1312},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/baevski22a/baevski22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/baevski22a.html},\n abstract = \t {While the general idea of self-supervised learning is identical across modalities, the actual algorithms and objectives differ widely because they were developed with a single modality in mind. To get us closer to general self-supervised learning, we present data2vec, a framework that uses the same learning method for either speech, NLP or computer vision. The core idea is to predict latent representations of the full input data based on a masked view of the input in a self-distillation setup using a standard Transformer architecture. Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which are local in nature, data2vec predicts contextualized latent representations that contain information from the entire input. 
Experiments on the major benchmarks of speech recognition, image classification, and natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches.}\n}", "pdf": "https://proceedings.mlr.press/v162/baevski22a/baevski22a.pdf", "supp": "", "pdf_size": 808672, "gs_citation": 1024, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12686412422242429370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Meta AI; Meta AI; SambaNova + Meta AI; Meta AI; Meta AI; Meta AI", "aff_domain": "fb.com; ; ; ;fb.com; ", "email": "fb.com; ; ; ;fb.com; ", "github": "www.github.com/pytorch/fairseq/tree/master/examples/data2vec", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v162/baevski22a.html", "aff_unique_index": "0;0;1+0;0;0;0", "aff_unique_norm": "Meta;SambaNova Systems, Inc.", "aff_unique_dep": "Meta AI;", "aff_unique_url": "https://meta.com;https://www.sambanova.com", "aff_unique_abbr": "Meta;SambaNova", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "pathGCN: Learning General Graph Spatial Operators from Paths", "status": "Spotlight", "track": "main", "site": "https://icml.cc/virtual/2022/poster/17145", "id": "17145", "proceeding": "https://proceedings.mlr.press/v162/eliasof22a.html", "poster": "/media/PosterPDFs/ICML%202022/619953730129049907919279f29bd9d7.png?t=1657735390.5148747", "slides": "", "author_site": "Moshe Eliasof, Eldad Haber, Eran Treister", "author": "Moshe Eliasof; Eldad Haber; Eran Treister", "abstract": "Graph Convolutional Networks (GCNs), similarly to Convolutional Neural Networks (CNNs), are typically based on two main operations - spatial and point-wise convolutions. In the context of GCNs, differently from CNNs, a pre-determined spatial operator based on the graph Laplacian is often chosen, allowing only the point-wise operations to be learnt. However, learning a meaningful spatial operator is critical for developing more expressive GCNs for improved performance. In this paper we propose pathGCN, a novel approach to learn the spatial operator from random paths on the graph. We analyze the convergence of our method and its difference from existing GCNs. Furthermore, we discuss several options of combining our learnt spatial operator with point-wise convolutions. 
Our extensive experiments on numerous datasets suggest that by properly learning both the spatial and point-wise convolutions, phenomena like over-smoothing can be inherently avoided, and new state-of-the-art performance is achieved.", "bibtex": "@InProceedings{pmlr-v162-eliasof22a,\n title = \t {path{GCN}: Learning General Graph Spatial Operators from Paths},\n author = {Eliasof, Moshe and Haber, Eldad and Treister, Eran},\n booktitle = \t {Proceedings of the 39th International Conference on Machine Learning},\n pages = \t {5878--5891},\n year = \t {2022},\n editor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\n volume = \t {162},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {17--23 Jul},\n publisher = {PMLR},\n pdf = \t {https://proceedings.mlr.press/v162/eliasof22a/eliasof22a.pdf},\n url = \t {https://proceedings.mlr.press/v162/eliasof22a.html},\n abstract = \t {Graph Convolutional Networks (GCNs), similarly to Convolutional Neural Networks (CNNs), are typically based on two main operations - spatial and point-wise convolutions. In the context of GCNs, differently from CNNs, a pre-determined spatial operator based on the graph Laplacian is often chosen, allowing only the point-wise operations to be learnt. However, learning a meaningful spatial operator is critical for developing more expressive GCNs for improved performance. In this paper we propose pathGCN, a novel approach to learn the spatial operator from random paths on the graph. We analyze the convergence of our method and its difference from existing GCNs. Furthermore, we discuss several options of combining our learnt spatial operator with point-wise convolutions. Our extensive experiments on numerous datasets suggest that by properly learning both the spatial and point-wise convolutions, phenomena like over-smoothing can be inherently avoided, and new state-of-the-art performance is achieved.}\n}", "pdf": "https://proceedings.mlr.press/v162/eliasof22a/eliasof22a.pdf", "supp": "https://media.icml.cc/Conferences/ICML2022/supplementary/eliasof22a-supp.zip", "pdf_size": 2042643, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6367968240585802827&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Ben-Gurion University, Israel; Department of Earth, Ocean and Atmospheric Sciences, University of British Columbia, Canada; Department of Computer Science, Ben-Gurion University, Israel", "aff_domain": "post.bgu.ac.il; ;cs.bgu.ac.il", "email": "post.bgu.ac.il; ;cs.bgu.ac.il", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v162/eliasof22a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Ben-Gurion University;University of British Columbia", "aff_unique_dep": "Department of Computer Science;Department of Earth, Ocean and Atmospheric Sciences", "aff_unique_url": "https://www.bgu.ac.il;https://www.ubc.ca", "aff_unique_abbr": "BGU;UBC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Israel;Canada" } ]